diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/kernel | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
Collisions have been removed because its logic was found on the
source already.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/kernel')
220 files changed, 20362 insertions, 13547 deletions
diff --git a/kernel/kernel/.gitignore b/kernel/kernel/.gitignore index 790d83c7d..b3097bde4 100644 --- a/kernel/kernel/.gitignore +++ b/kernel/kernel/.gitignore @@ -5,4 +5,3 @@ config_data.h config_data.gz timeconst.h hz.bc -x509_certificate_list diff --git a/kernel/kernel/Kconfig.locks b/kernel/kernel/Kconfig.locks index c61e9131e..b9e6aa7e5 100644 --- a/kernel/kernel/Kconfig.locks +++ b/kernel/kernel/Kconfig.locks @@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER def_bool y depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER -config ARCH_USE_QUEUE_RWLOCK +config ARCH_USE_QUEUED_SPINLOCKS bool -config QUEUE_RWLOCK - def_bool y if ARCH_USE_QUEUE_RWLOCK +config QUEUED_SPINLOCKS + def_bool y if ARCH_USE_QUEUED_SPINLOCKS + depends on SMP + +config ARCH_USE_QUEUED_RWLOCKS + bool + +config QUEUED_RWLOCKS + def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP diff --git a/kernel/kernel/Makefile b/kernel/kernel/Makefile index 60c302cfb..53abf008e 100644 --- a/kernel/kernel/Makefile +++ b/kernel/kernel/Makefile @@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o +obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o @@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o @@ -98,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o + +obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h @@ -111,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) - -############################################################################### -# -# Roll all the X.509 certificates that we can find together and pull them into -# the kernel so that they get loaded into the system trusted keyring during -# boot. -# -# We look in the source root and the build root for all files whose name ends -# in ".x509". Unfortunately, this will generate duplicate filenames, so we -# have make canonicalise the pathnames and then sort them to discard the -# duplicates. -# -############################################################################### -ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) -X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 -X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ - $(or $(realpath $(CERT)),$(CERT)))) -X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) - -ifeq ($(X509_CERTIFICATES),) -$(warning *** No X.509 certificates found ***) -endif - -ifneq ($(wildcard $(obj)/.x509.list),) -ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(info X.509 certificate list changed) -$(shell rm $(obj)/.x509.list) -endif -endif - -kernel/system_certificates.o: $(obj)/x509_certificate_list - -quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") - -targets += $(obj)/x509_certificate_list -$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list - $(call if_changed,x509certs) - -targets += $(obj)/.x509.list -$(obj)/.x509.list: - @echo $(X509_CERTIFICATES) >$@ -endif - -clean-files := x509_certificate_list .x509.list - -ifeq ($(CONFIG_MODULE_SIG),y) -############################################################################### -# -# If module signing is requested, say by allyesconfig, but a key has not been -# supplied, then one will need to be generated to make sure the build does not -# fail and that the kernel may be used afterwards. -# -############################################################################### -ifndef CONFIG_MODULE_SIG_HASH -$(error Could not determine digest type to use from kernel config) -endif - -signing_key.priv signing_key.x509: x509.genkey - @echo "###" - @echo "### Now generating an X.509 key pair to be used for signing modules." - @echo "###" - @echo "### If this takes a long time, you might wish to run rngd in the" - @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and uses a hardware random" - @echo "### number generator if one is available." - @echo "###" - openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ - -batch -x509 -config x509.genkey \ - -outform DER -out signing_key.x509 \ - -keyout signing_key.priv 2>&1 - @echo "###" - @echo "### Key pair generated." - @echo "###" - -x509.genkey: - @echo Generating X.509 key generation config - @echo >x509.genkey "[ req ]" - @echo >>x509.genkey "default_bits = 4096" - @echo >>x509.genkey "distinguished_name = req_distinguished_name" - @echo >>x509.genkey "prompt = no" - @echo >>x509.genkey "string_mask = utf8only" - @echo >>x509.genkey "x509_extensions = myexts" - @echo >>x509.genkey - @echo >>x509.genkey "[ req_distinguished_name ]" - @echo >>x509.genkey "#O = Unspecified company" - @echo >>x509.genkey "CN = Build time autogenerated kernel key" - @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" - @echo >>x509.genkey - @echo >>x509.genkey "[ myexts ]" - @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" - @echo >>x509.genkey "keyUsage=digitalSignature" - @echo >>x509.genkey "subjectKeyIdentifier=hash" - @echo >>x509.genkey "authorityKeyIdentifier=keyid" -endif diff --git a/kernel/kernel/audit.c b/kernel/kernel/audit.c index 1c13e4267..5ffcbd354 100644 --- a/kernel/kernel/audit.c +++ b/kernel/kernel/audit.c @@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb) static void kauditd_send_skb(struct sk_buff *skb) { int err; + int attempts = 0; +#define AUDITD_RETRIES 5 + +restart: /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ + pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", + audit_pid, err); if (audit_pid) { - pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared"); - audit_pid = 0; - audit_sock = NULL; + if (err == -ECONNREFUSED || err == -EPERM + || ++attempts >= AUDITD_RETRIES) { + char s[32]; + + snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); + audit_log_lost(s); + audit_pid = 0; + audit_sock = NULL; + } else { + pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", + attempts, audit_pid); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + goto restart; + } } /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); @@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { - int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; - return rc; + return; } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) - return rc; + return; audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); - - return rc; } int is_audit_feature_set(int i) @@ -1357,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; @@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string, * @string: string to be checked * @len: max length of the string to check */ -int audit_string_contains_control(const char *string, size_t len) +bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) - return 1; + return true; } - return 0; + return false; } /** @@ -1761,7 +1775,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) + if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", @@ -1904,7 +1918,7 @@ EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_link_denied - report a link restriction denial - * @operation: specific link opreation + * @operation: specific link operation * @link: the path that triggered the restriction */ void audit_log_link_denied(const char *operation, struct path *link) diff --git a/kernel/kernel/audit.h b/kernel/kernel/audit.h index d641f9bb3..de6cbb7cf 100644 --- a/kernel/kernel/audit.h +++ b/kernel/kernel/audit.h @@ -50,6 +50,7 @@ enum audit_state { /* Rule lists */ struct audit_watch; +struct audit_fsnotify_mark; struct audit_tree; struct audit_chunk; @@ -252,6 +253,7 @@ struct audit_net { extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; +extern int audit_del_rule(struct audit_entry *); extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; @@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); + +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern char *audit_mark_path(struct audit_fsnotify_mark *mark); +extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); +extern void audit_remove_mark_rule(struct audit_krule *krule); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); +extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); + #else #define audit_put_watch(w) {} #define audit_get_watch(w) {} @@ -278,12 +289,19 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev #define audit_watch_path(w) "" #define audit_watch_compare(w, i, d) 0 +#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) +#define audit_mark_path(m) "" +#define audit_remove_mark(m) +#define audit_remove_mark_rule(k) +#define audit_mark_compare(m, i, d) 0 +#define audit_exe_compare(t, m) (-EINVAL) +#define audit_dupe_exe(n, o) (-EINVAL) #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); extern void audit_put_chunk(struct audit_chunk *); -extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); +extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); extern int audit_make_tree(struct audit_krule *, char *, u32); extern int audit_add_tree_rule(struct audit_krule *); extern int audit_remove_tree_rule(struct audit_krule *); diff --git a/kernel/kernel/audit_fsnotify.c b/kernel/kernel/audit_fsnotify.c new file mode 100644 index 000000000..27c6046c2 --- /dev/null +++ b/kernel/kernel/audit_fsnotify.c @@ -0,0 +1,216 @@ +/* audit_fsnotify.c -- tracking inodes + * + * Copyright 2003-2009,2014-2015 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/fsnotify_backend.h> +#include <linux/namei.h> +#include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/security.h> +#include "audit.h" + +/* + * this mark lives on the parent directory of the inode in question. + * but dev, ino, and path are about the child + */ +struct audit_fsnotify_mark { + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + char *path; /* insertion path */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ + struct audit_krule *rule; +}; + +/* fsnotify handle. */ +static struct fsnotify_group *audit_fsnotify_group; + +/* fsnotify events we care about. */ +#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) + +static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) +{ + kfree(audit_mark->path); + kfree(audit_mark); +} + +static void audit_fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct audit_fsnotify_mark *audit_mark; + + audit_mark = container_of(mark, struct audit_fsnotify_mark, mark); + audit_fsnotify_mark_free(audit_mark); +} + +char *audit_mark_path(struct audit_fsnotify_mark *mark) +{ + return mark->path; +} + +int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +{ + if (mark->ino == AUDIT_INO_UNSET) + return 0; + return (mark->ino == ino) && (mark->dev == dev); +} + +static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, + struct inode *inode) +{ + audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; + audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; +} + +struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) +{ + struct audit_fsnotify_mark *audit_mark; + struct path path; + struct dentry *dentry; + struct inode *inode; + int ret; + + if (pathname[0] != '/' || pathname[len-1] == '/') + return ERR_PTR(-EINVAL); + + dentry = kern_path_locked(pathname, &path); + if (IS_ERR(dentry)) + return (void *)dentry; /* returning an error */ + inode = path.dentry->d_inode; + mutex_unlock(&inode->i_mutex); + + audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); + if (unlikely(!audit_mark)) { + audit_mark = ERR_PTR(-ENOMEM); + goto out; + } + + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + audit_mark->mark.mask = AUDIT_FS_EVENTS; + audit_mark->path = pathname; + audit_update_mark(audit_mark, dentry->d_inode); + audit_mark->rule = krule; + + ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + if (ret < 0) { + audit_fsnotify_mark_free(audit_mark); + audit_mark = ERR_PTR(ret); + } +out: + dput(dentry); + path_put(&path); + return audit_mark; +} + +static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op) +{ + struct audit_buffer *ab; + struct audit_krule *rule = audit_mark->rule; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "auid=%u ses=%u op=", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, audit_mark->path); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + +void audit_remove_mark(struct audit_fsnotify_mark *audit_mark) +{ + fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group); + fsnotify_put_mark(&audit_mark->mark); +} + +void audit_remove_mark_rule(struct audit_krule *krule) +{ + struct audit_fsnotify_mark *mark = krule->exe; + + audit_remove_mark(mark); +} + +static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) +{ + struct audit_krule *rule = audit_mark->rule; + struct audit_entry *entry = container_of(rule, struct audit_entry, rule); + + audit_mark_log_rule_change(audit_mark, "autoremove_rule"); + audit_del_rule(entry); +} + +/* Update mark data in audit rules based on fsnotify events. */ +static int audit_mark_handle_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *dname, u32 cookie) +{ + struct audit_fsnotify_mark *audit_mark; + struct inode *inode = NULL; + + audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); + + BUG_ON(group != audit_fsnotify_group); + + switch (data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = ((struct path *)data)->dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = (struct inode *)data; + break; + default: + BUG(); + return 0; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + return 0; + audit_update_mark(audit_mark, inode); + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + audit_autoremove_mark_rule(audit_mark); + + return 0; +} + +static const struct fsnotify_ops audit_mark_fsnotify_ops = { + .handle_event = audit_mark_handle_event, +}; + +static int __init audit_fsnotify_init(void) +{ + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + if (IS_ERR(audit_fsnotify_group)) { + audit_fsnotify_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } + return 0; +} +device_initcall(audit_fsnotify_init); diff --git a/kernel/kernel/audit_tree.c b/kernel/kernel/audit_tree.c index b0f987727..5efe9b299 100644 --- a/kernel/kernel/audit_tree.c +++ b/kernel/kernel/audit_tree.c @@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) return NULL; } -int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) +bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) { int n; for (n = 0; n < chunk->count; n++) if (chunk->owners[n].owner == tree) - return 1; - return 0; + return true; + return false; } /* tagging and untagging inodes with trees */ @@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ audit_tree_log_remove_rule(rule); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/kernel/audit_watch.c b/kernel/kernel/audit_watch.c index 6e30024d9..656c7e93a 100644 --- a/kernel/kernel/audit_watch.c +++ b/kernel/kernel/audit_watch.c @@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch) int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return (watch->ino != (unsigned long)-1) && + return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && (watch->dev == dev); } @@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path) INIT_LIST_HEAD(&watch->rules); atomic_set(&watch->count, 1); watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; + watch->dev = AUDIT_DEV_UNSET; + watch->ino = AUDIT_INO_UNSET; return watch; } @@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) if (IS_ERR(watch)) return PTR_ERR(watch); - audit_get_watch(watch); krule->watch = watch; return 0; @@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, list_replace(&oentry->rule.list, &nentry->rule.list); } + if (oentry->rule.exe) + audit_remove_mark(oentry->rule.exe); audit_watch_log_rule_change(r, owatch, "updated_rules"); @@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); audit_watch_log_rule_change(r, w, "remove_rule"); + if (e->rule.exe) + audit_remove_mark(e->rule.exe); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); @@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule, watch_found = 1; - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); + /* put krule's ref to temporary watch */ audit_put_watch(watch); audit_get_watch(w); krule->watch = watch = w; + + audit_put_parent(parent); break; } if (!watch_found) { - audit_get_parent(parent); watch->parent = parent; + audit_get_watch(watch); list_add(&watch->wlist, &parent->watches); } list_add(&krule->rlist, &watch->rules); @@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - /* match get in audit_find_parent or audit_init_parent */ - audit_put_parent(parent); - h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: @@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); @@ -517,3 +518,36 @@ static int __init audit_watch_init(void) return 0; } device_initcall(audit_watch_init); + +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +{ + struct audit_fsnotify_mark *audit_mark; + char *pathname; + + pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + if (IS_ERR(audit_mark)) { + kfree(pathname); + return PTR_ERR(audit_mark); + } + new->exe = audit_mark; + + return 0; +} + +int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) +{ + struct file *exe_file; + unsigned long ino; + dev_t dev; + + rcu_read_lock(); + exe_file = rcu_dereference(tsk->mm->exe_file); + ino = exe_file->f_inode->i_ino; + dev = exe_file->f_inode->i_sb->s_dev; + rcu_read_unlock(); + return audit_mark_compare(mark, ino, dev); +} diff --git a/kernel/kernel/auditfilter.c b/kernel/kernel/auditfilter.c index 72e1660a7..b8ff9e193 100644 --- a/kernel/kernel/auditfilter.c +++ b/kernel/kernel/auditfilter.c @@ -39,13 +39,13 @@ * Locking model: * * audit_filter_mutex: - * Synchronizes writes and blocking reads of audit's filterlist - * data. Rcu is used to traverse the filterlist and access - * contents of structs audit_entry, audit_watch and opaque - * LSM rules during filtering. If modified, these structures - * must be copied and replace their counterparts in the filterlist. - * An audit_parent struct is not accessed during filtering, so may - * be written directly provided audit_filter_mutex is held. + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * LSM rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. */ /* Audit filter lists, defined in <linux/audit.h> */ @@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_EXE: + if (f->op != Audit_equal) + return -EINVAL; + if (entry->rule.listnr != AUDIT_FILTER_EXIT) + return -EINVAL; + break; }; return 0; } @@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; + struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) @@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_EXE: + if (entry->rule.exe || f->val > PATH_MAX) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) { + err = PTR_ERR(str); + goto exit_free; + } + entry->rule.buflen += f->val; + + audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + if (IS_ERR(audit_mark)) { + kfree(str); + err = PTR_ERR(audit_mark); + goto exit_free; + } + entry->rule.exe = audit_mark; + break; } } @@ -549,10 +574,10 @@ exit_nofree: return entry; exit_free: - if (entry->rule.watch) - audit_put_watch(entry->rule.watch); /* matches initial get */ if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } @@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_EXE: + data->buflen += data->values[i] = + audit_pack_string(&bufp, audit_mark_path(krule->exe)); + break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; @@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_EXE: + /* both paths exist based on above type compare */ + if (strcmp(audit_mark_path(a->exe), + audit_mark_path(b->exe))) + return 1; + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) err = -ENOMEM; else new->filterkey = fk; + break; + case AUDIT_EXE: + err = audit_dupe_exe(new, old); + break; } if (err) { + if (new->exe) + audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } @@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int err; + int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); - goto error; + return err; } if (watch) { @@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry) */ if (tree) audit_put_tree(tree); - goto error; + return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); - goto error; + return err; } } @@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - return 0; - -error: - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ return err; } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry) +int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; @@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { - mutex_unlock(&audit_filter_mutex); ret = -ENOENT; goto out; } @@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry) if (e->rule.tree) audit_remove_tree_rule(&e->rule); - list_del_rcu(&e->list); - list_del(&e->rule.list); - call_rcu(&e->rcu, audit_free_rule_rcu); + if (e->rule.exe) + audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) @@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry) if (!audit_match_signal(entry)) audit_signals--; #endif - mutex_unlock(&audit_filter_mutex); + + list_del_rcu(&e->list); + list_del(&e->rule.list); + call_rcu(&e->rcu, audit_free_rule_rcu); out: - if (watch) - audit_put_watch(watch); /* match initial get */ + mutex_unlock(&audit_filter_mutex); + if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, WARN_ON(1); } - if (err || type == AUDIT_DEL_RULE) + if (err || type == AUDIT_DEL_RULE) { + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); audit_free_rule(entry); + } return err; } @@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) return 0; nentry = audit_dupe_rule(r); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ diff --git a/kernel/kernel/auditsc.c b/kernel/kernel/auditsc.c index 9fb9d1cb8..b86cc0495 100644 --- a/kernel/kernel/auditsc.c +++ b/kernel/kernel/auditsc.c @@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val) return 0; list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && + if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } @@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(ctx->ppid, f->op, f->val); } break; + case AUDIT_EXE: + result = audit_exe_compare(tsk, rule->exe); + break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; @@ -599,9 +602,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = match_tree_refs(ctx, rule->tree); break; case AUDIT_LOGINUID: - result = 0; - if (ctx) - result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); + result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); @@ -1023,8 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); + if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { send_sig(SIGKILL, current, 0); return -1; } @@ -1683,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, aname->should_free = true; } - aname->ino = (unsigned long)-1; + aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); @@ -1925,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent, if (inode) audit_copy_inode(found_child, dentry, inode); else - found_child->ino = (unsigned long)-1; + found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/kernel/bpf/Makefile b/kernel/kernel/bpf/Makefile index e6983be12..13272582e 100644 --- a/kernel/kernel/bpf/Makefile +++ b/kernel/kernel/bpf/Makefile @@ -1,2 +1,4 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o + +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o diff --git a/kernel/kernel/bpf/arraymap.c b/kernel/kernel/bpf/arraymap.c index 8a6616583..b0799bced 100644 --- a/kernel/kernel/bpf/arraymap.c +++ b/kernel/kernel/bpf/arraymap.c @@ -14,12 +14,8 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/mm.h> - -struct bpf_array { - struct bpf_map map; - u32 elem_size; - char value[0] __aligned(8); -}; +#include <linux/filter.h> +#include <linux/perf_event.h> /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) @@ -32,11 +28,17 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) attr->value_size == 0) return ERR_PTR(-EINVAL); + if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return ERR_PTR(-E2BIG); + elem_size = round_up(attr->value_size, 8); /* check round_up into zero and u32 overflow */ if (elem_size == 0 || - attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) return ERR_PTR(-ENOMEM); array_size = sizeof(*array) + attr->max_entries * elem_size; @@ -53,7 +55,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; return &array->map; @@ -109,7 +111,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, array->elem_size); + memcpy(array->value + array->elem_size * index, value, map->value_size); return 0; } @@ -154,3 +156,193 @@ static int __init register_array_map(void) return 0; } late_initcall(register_array_map); + +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +{ + /* only file descriptors can be stored in this type of map */ + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + return array_map_alloc(attr); +} + +static void fd_array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + synchronize_rcu(); + + /* make sure it's empty */ + for (i = 0; i < array->map.max_entries; i++) + BUG_ON(array->ptrs[i] != NULL); + kvfree(array); +} + +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* only called from syscall */ +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + void *new_ptr, *old_ptr; + u32 index = *(u32 *)key, ufd; + + if (map_flags != BPF_ANY) + return -EINVAL; + + if (index >= array->map.max_entries) + return -E2BIG; + + ufd = *(u32 *)value; + new_ptr = map->ops->map_fd_get_ptr(map, ufd); + if (IS_ERR(new_ptr)) + return PTR_ERR(new_ptr); + + old_ptr = xchg(array->ptrs + index, new_ptr); + if (old_ptr) + map->ops->map_fd_put_ptr(old_ptr); + + return 0; +} + +static int fd_array_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + void *old_ptr; + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return -E2BIG; + + old_ptr = xchg(array->ptrs + index, NULL); + if (old_ptr) { + map->ops->map_fd_put_ptr(old_ptr); + return 0; + } else { + return -ENOENT; + } +} + +static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) + return prog; + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return ERR_PTR(-EINVAL); + } + return prog; +} + +static void prog_fd_array_put_ptr(void *ptr) +{ + struct bpf_prog *prog = ptr; + + bpf_prog_put_rcu(prog); +} + +/* decrement refcnt of all bpf_progs that are stored in this map */ +void bpf_fd_array_map_clear(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + for (i = 0; i < array->map.max_entries; i++) + fd_array_map_delete_elem(map, &i); +} + +static const struct bpf_map_ops prog_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = fd_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = prog_fd_array_get_ptr, + .map_fd_put_ptr = prog_fd_array_put_ptr, +}; + +static struct bpf_map_type_list prog_array_type __read_mostly = { + .ops = &prog_array_ops, + .type = BPF_MAP_TYPE_PROG_ARRAY, +}; + +static int __init register_prog_array_map(void) +{ + bpf_register_map_type(&prog_array_type); + return 0; +} +late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + goto err; + + if (attr->inherit) + goto err; + + if (attr->type == PERF_TYPE_RAW) + return event; + + if (attr->type == PERF_TYPE_HARDWARE) + return event; + + if (attr->type == PERF_TYPE_SOFTWARE && + attr->config == PERF_COUNT_SW_BPF_OUTPUT) + return event; +err: + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = &perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(&perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); diff --git a/kernel/kernel/bpf/core.c b/kernel/kernel/bpf/core.c index 54f0e7fcd..334b1bdd5 100644 --- a/kernel/kernel/bpf/core.c +++ b/kernel/kernel/bpf/core.c @@ -26,9 +26,10 @@ #include <linux/vmalloc.h> #include <linux/random.h> #include <linux/moduleloader.h> -#include <asm/unaligned.h> #include <linux/bpf.h> +#include <asm/unaligned.h> + /* Registers */ #define BPF_R0 regs[BPF_REG_0] #define BPF_R1 regs[BPF_REG_1] @@ -62,6 +63,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns ptr = skb_network_header(skb) + k - SKF_NET_OFF; else if (k >= SKF_LL_OFF) ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; @@ -80,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; + kmemcheck_annotate_bitfield(fp, meta); + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -88,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->prog = fp; return fp; } @@ -108,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; + fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. @@ -175,6 +183,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } +EXPORT_SYMBOL_GPL(__bpf_call_base); /** * __bpf_prog_run - run eBPF program on a given context @@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, @@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; + u32 tail_call_cnt = 0; void *ptr; int off; @@ -431,6 +442,34 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_TAIL_CALL: { + struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog; + u64 index = BPF_R3; + + if (unlikely(index >= array->map.max_entries)) + goto out; + + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + goto out; + + tail_call_cnt++; + + prog = READ_ONCE(array->ptrs[index]); + if (unlikely(!prog)) + goto out; + + /* ARG1 at this point is guaranteed to point to CTX from + * the verifier side due to the fact that the tail call is + * handeled like a helper, that is, bpf_tail_call_proto, + * where arg1_type is ARG_PTR_TO_CTX. + */ + insn = prog->insnsi; + goto select_insn; +out: + CONT; + } /* JMP */ JMP_JA: insn += insn->off; @@ -615,25 +654,63 @@ load_byte: return 0; } -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +bool bpf_prog_array_compatible(struct bpf_array *array, + const struct bpf_prog *fp) +{ + if (!array->owner_prog_type) { + /* There's no owner yet where we could check for + * compatibility. + */ + array->owner_prog_type = fp->type; + array->owner_jited = fp->jited; + + return true; + } + + return array->owner_prog_type == fp->type && + array->owner_jited == fp->jited; +} + +static int bpf_check_tail_call(const struct bpf_prog *fp) { + struct bpf_prog_aux *aux = fp->aux; + int i; + + for (i = 0; i < aux->used_map_cnt; i++) { + struct bpf_map *map = aux->used_maps[i]; + struct bpf_array *array; + + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + + array = container_of(map, struct bpf_array, map); + if (!bpf_prog_array_compatible(array, fp)) + return -EINVAL; + } + + return 0; } /** - * bpf_prog_select_runtime - select execution runtime for BPF program + * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program * - * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via BPF_PROG_RUN() macro + * Try to JIT eBPF program, if JIT is not available, use interpreter. + * The BPF program will be executed via BPF_PROG_RUN() macro. */ -void bpf_prog_select_runtime(struct bpf_prog *fp) +int bpf_prog_select_runtime(struct bpf_prog *fp) { fp->bpf_func = (void *) __bpf_prog_run; - /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); - /* Lock whole bpf_prog as read-only */ bpf_prog_lock_ro(fp); + + /* The tail call compatibility check can only be done at + * this late stage as we need to determine, if we deal + * with JITed or non JITed program concatenations and not + * all eBPF JITs might immediately support all features. + */ + return bpf_check_tail_call(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -651,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp) struct bpf_prog_aux *aux = fp->aux; INIT_WORK(&aux->work, bpf_prog_free_deferred); - aux->prog = fp; schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); + +void bpf_user_rnd_init_once(void) +{ + prandom_init_once(&bpf_user_rnd_state); +} + +u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* Should someone ever have the rather unwise idea to use some + * of the registers passed into this function, then note that + * this function is called from native eBPF and classic-to-eBPF + * transformations. Register assignments from both sides are + * different, f.e. classic always sets fn(ctx, A, X) here. + */ + struct rnd_state *state; + u32 res; + + state = &get_cpu_var(bpf_user_rnd_state); + res = prandom_u32_state(state); + put_cpu_var(state); + + return res; +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; @@ -663,6 +765,29 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; +const struct bpf_func_proto bpf_get_current_comm_proto __weak; +const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) +{ + return NULL; +} + +/* Always built-in helper functions. */ +const struct bpf_func_proto bpf_tail_call_proto = { + .func = NULL, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. diff --git a/kernel/kernel/bpf/hashtab.c b/kernel/kernel/bpf/hashtab.c index 83c209d9b..34777b374 100644 --- a/kernel/kernel/bpf/hashtab.c +++ b/kernel/kernel/bpf/hashtab.c @@ -17,7 +17,7 @@ struct bpf_htab { struct bpf_map map; struct hlist_head *buckets; - spinlock_t lock; + raw_spinlock_t lock; u32 count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -64,12 +64,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - err = -ENOMEM; + if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + goto free_htab; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) goto free_htab; + if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + (u64) htab->elem_size * htab->map.max_entries >= + U32_MAX - PAGE_SIZE) + /* make sure page count doesn't overflow */ + goto free_htab; + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->elem_size * htab->map.max_entries, + PAGE_SIZE) >> PAGE_SHIFT; + + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), GFP_USER | __GFP_NOWARN); @@ -82,12 +105,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) for (i = 0; i < htab->n_buckets; i++) INIT_HLIST_HEAD(&htab->buckets[i]); - spin_lock_init(&htab->lock); + raw_spin_lock_init(&htab->lock); htab->count = 0; - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; return &htab->map; free_htab: @@ -218,7 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, WARN_ON_ONCE(!rcu_read_lock_held()); /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); if (!l_new) return -ENOMEM; @@ -230,7 +250,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, l_new->hash = htab_map_hash(l_new->key, key_size); /* bpf_map_update_elem() can be called in_irq() */ - spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&htab->lock, flags); head = select_bucket(htab, l_new->hash); @@ -266,11 +286,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } else { htab->count++; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); return 0; err: - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); kfree(l_new); return ret; } @@ -291,7 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) hash = htab_map_hash(key, key_size); - spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&htab->lock, flags); head = select_bucket(htab, hash); @@ -304,7 +324,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); return ret; } diff --git a/kernel/kernel/bpf/helpers.c b/kernel/kernel/bpf/helpers.c index bd7f5988e..4504ca661 100644 --- a/kernel/kernel/bpf/helpers.c +++ b/kernel/kernel/bpf/helpers.c @@ -13,6 +13,9 @@ #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> +#include <linux/ktime.h> +#include <linux/sched.h> +#include <linux/uidgid.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -44,11 +47,11 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } const struct bpf_func_proto bpf_map_lookup_elem_proto = { - .func = bpf_map_lookup_elem, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, }; static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) @@ -63,13 +66,13 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } const struct bpf_func_proto bpf_map_update_elem_proto = { - .func = bpf_map_update_elem, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, - .arg3_type = ARG_PTR_TO_MAP_VALUE, - .arg4_type = ARG_ANYTHING, + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, }; static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) @@ -83,20 +86,15 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } const struct bpf_func_proto bpf_map_delete_elem_proto = { - .func = bpf_map_delete_elem, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return prandom_u32(); -} - const struct bpf_func_proto bpf_get_prandom_u32_proto = { - .func = bpf_get_prandom_u32, + .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -111,3 +109,71 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + + if (!task) + return -EINVAL; + + return (u64) task->tgid << 32 | task->pid; +} + +const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { + .func = bpf_get_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + kuid_t uid; + kgid_t gid; + + if (!task) + return -EINVAL; + + current_uid_gid(&uid, &gid); + return (u64) from_kgid(&init_user_ns, gid) << 32 | + from_kuid(&init_user_ns, uid); +} + +const struct bpf_func_proto bpf_get_current_uid_gid_proto = { + .func = bpf_get_current_uid_gid, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + char *buf = (char *) (long) r1; + + if (!task) + return -EINVAL; + + memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + return 0; +} + +const struct bpf_func_proto bpf_get_current_comm_proto = { + .func = bpf_get_current_comm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; diff --git a/kernel/kernel/bpf/inode.c b/kernel/kernel/bpf/inode.c new file mode 100644 index 000000000..5a8a797d5 --- /dev/null +++ b/kernel/kernel/bpf/inode.c @@ -0,0 +1,387 @@ +/* + * Minimal file system backend for holding eBPF maps and programs, + * used by bpf(2) object pinning. + * + * Authors: + * + * Daniel Borkmann <daniel@iogearbox.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/magic.h> +#include <linux/major.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/fs.h> +#include <linux/kdev_t.h> +#include <linux/filter.h> +#include <linux/bpf.h> + +enum bpf_type { + BPF_TYPE_UNSPEC = 0, + BPF_TYPE_PROG, + BPF_TYPE_MAP, +}; + +static void *bpf_any_get(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + break; + case BPF_TYPE_MAP: + bpf_map_inc(raw, true); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return raw; +} + +static void bpf_any_put(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + bpf_prog_put(raw); + break; + case BPF_TYPE_MAP: + bpf_map_put_with_uref(raw); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) +{ + void *raw; + + *type = BPF_TYPE_MAP; + raw = bpf_map_get_with_uref(ufd); + if (IS_ERR(raw)) { + *type = BPF_TYPE_PROG; + raw = bpf_prog_get(ufd); + } + + return raw; +} + +static const struct inode_operations bpf_dir_iops; + +static const struct inode_operations bpf_prog_iops = { }; +static const struct inode_operations bpf_map_iops = { }; + +static struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) +{ + struct inode *inode; + + switch (mode & S_IFMT) { + case S_IFDIR: + case S_IFREG: + break; + default: + return ERR_PTR(-EINVAL); + } + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOSPC); + + inode->i_ino = get_next_ino(); + inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + + inode_init_owner(inode, dir, mode); + + return inode; +} + +static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) +{ + *type = BPF_TYPE_UNSPEC; + if (inode->i_op == &bpf_prog_iops) + *type = BPF_TYPE_PROG; + else if (inode->i_op == &bpf_map_iops) + *type = BPF_TYPE_MAP; + else + return -EACCES; + + return 0; +} + +static bool bpf_dname_reserved(const struct dentry *dentry) +{ + return strchr(dentry->d_name.name, '.'); +} + +static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + + inc_nlink(inode); + inc_nlink(dir); + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct inode_operations *iops) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = iops; + inode->i_private = dentry->d_fsdata; + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t devt) +{ + enum bpf_type type = MINOR(devt); + + if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || + dentry->d_fsdata == NULL) + return -EPERM; + + switch (type) { + case BPF_TYPE_PROG: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); + case BPF_TYPE_MAP: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); + default: + return -EPERM; + } +} + +static const struct inode_operations bpf_dir_iops = { + .lookup = simple_lookup, + .mknod = bpf_mkobj, + .mkdir = bpf_mkdir, + .rmdir = simple_rmdir, + .unlink = simple_unlink, +}; + +static int bpf_obj_do_pin(const struct filename *pathname, void *raw, + enum bpf_type type) +{ + struct dentry *dentry; + struct inode *dir; + struct path path; + umode_t mode; + dev_t devt; + int ret; + + dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + devt = MKDEV(UNNAMED_MAJOR, type); + + ret = security_path_mknod(&path, dentry, mode, devt); + if (ret) + goto out; + + dir = d_inode(path.dentry); + if (dir->i_op != &bpf_dir_iops) { + ret = -EPERM; + goto out; + } + + dentry->d_fsdata = raw; + ret = vfs_mknod(dir, dentry, mode, devt); + dentry->d_fsdata = NULL; +out: + done_path_create(&path, dentry); + return ret; +} + +int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +{ + struct filename *pname; + enum bpf_type type; + void *raw; + int ret; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_fd_probe_obj(ufd, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + ret = bpf_obj_do_pin(pname, raw, type); + if (ret != 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void *bpf_obj_do_get(const struct filename *pathname, + enum bpf_type *type) +{ + struct inode *inode; + struct path path; + void *raw; + int ret; + + ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = d_backing_inode(path.dentry); + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto out; + + ret = bpf_inode_type(inode, type); + if (ret) + goto out; + + raw = bpf_any_get(inode->i_private, *type); + touch_atime(&path); + + path_put(&path); + return raw; +out: + path_put(&path); + return ERR_PTR(ret); +} + +int bpf_obj_get_user(const char __user *pathname) +{ + enum bpf_type type = BPF_TYPE_UNSPEC; + struct filename *pname; + int ret = -ENOENT; + void *raw; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_obj_do_get(pname, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + if (type == BPF_TYPE_PROG) + ret = bpf_prog_new_fd(raw); + else if (type == BPF_TYPE_MAP) + ret = bpf_map_new_fd(raw); + else + goto out; + + if (ret < 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void bpf_evict_inode(struct inode *inode) +{ + enum bpf_type type; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); +} + +static const struct super_operations bpf_super_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = bpf_evict_inode, +}; + +static int bpf_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr bpf_rfiles[] = { { "" } }; + struct inode *inode; + int ret; + + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); + if (ret) + return ret; + + sb->s_op = &bpf_super_ops; + + inode = sb->s_root->d_inode; + inode->i_op = &bpf_dir_iops; + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= S_ISVTX | S_IRWXUGO; + + return 0; +} + +static struct dentry *bpf_mount(struct file_system_type *type, int flags, + const char *dev_name, void *data) +{ + return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); +} + +static struct file_system_type bpf_fs_type = { + .owner = THIS_MODULE, + .name = "bpf", + .mount = bpf_mount, + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +MODULE_ALIAS_FS("bpf"); + +static int __init bpf_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, "bpf"); + if (ret) + return ret; + + ret = register_filesystem(&bpf_fs_type); + if (ret) + sysfs_remove_mount_point(fs_kobj, "bpf"); + + return ret; +} +fs_initcall(bpf_init); diff --git a/kernel/kernel/bpf/syscall.c b/kernel/kernel/bpf/syscall.c index 3bae6c591..3b39550d8 100644 --- a/kernel/kernel/bpf/syscall.c +++ b/kernel/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include <linux/filter.h> #include <linux/version.h> +int sysctl_unprivileged_bpf_disabled __read_mostly; + static LIST_HEAD(bpf_map_types); static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -44,15 +46,50 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +static int bpf_map_charge_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(map->pages, &user->locked_vm); + + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + map->user = user; + return 0; +} + +static void bpf_map_uncharge_memlock(struct bpf_map *map) +{ + struct user_struct *user = map->user; + + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + bpf_map_uncharge_memlock(map); /* implementation dependent freeing */ map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -64,11 +101,15 @@ void bpf_map_put(struct bpf_map *map) } } -static int bpf_map_release(struct inode *inode, struct file *filp) +void bpf_map_put_with_uref(struct bpf_map *map) { - struct bpf_map *map = filp->private_data; - + bpf_map_put_uref(map); bpf_map_put(map); +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + bpf_map_put_with_uref(filp->private_data); return 0; } @@ -76,6 +117,12 @@ static const struct file_operations bpf_map_fops = { .release = bpf_map_release, }; +int bpf_map_new_fd(struct bpf_map *map) +{ + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, + O_RDWR | O_CLOEXEC); +} + /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ @@ -101,9 +148,13 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); - err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + err = bpf_map_charge_memlock(map); + if (err) + goto free_map; + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ goto free_map; @@ -118,19 +169,36 @@ free_map: /* if error is returned, fd is released. * On success caller should complete fd access with matching fdput() */ -struct bpf_map *bpf_map_get(struct fd f) +struct bpf_map *__bpf_map_get(struct fd f) { - struct bpf_map *map; - if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_map_fops) { fdput(f); return ERR_PTR(-EINVAL); } - map = f.file->private_data; + return f.file->private_data; +} + +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map, true); + fdput(f); return map; } @@ -149,15 +217,16 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value, *ptr; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -171,7 +240,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -207,15 +276,16 @@ static int map_update_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -229,7 +299,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -259,15 +329,16 @@ static int map_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_ptr(attr->key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; + struct fd f; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -299,15 +370,16 @@ static int map_get_next_key(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *unext_key = u64_to_ptr(attr->next_key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *next_key; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -392,6 +464,23 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_tail_call) { + /* mark bpf_tail_call as different opcode + * to avoid conditional branch in + * interpeter for every normal call + * and to prevent accidental JITing by + * JIT compiler that doesn't support + * bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + continue; + } + fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions @@ -413,12 +502,51 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } -void bpf_prog_put(struct bpf_prog *prog) +static int bpf_prog_charge_memlock(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - free_used_maps(prog->aux); - bpf_prog_free(prog); + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(prog->pages, &user->locked_vm); + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); + return -EPERM; } + prog->aux->user = user; + return 0; +} + +static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = prog->aux->user; + + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); +} + +static void __prog_put_common(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + + free_used_maps(aux); + bpf_prog_uncharge_memlock(aux->prog); + bpf_prog_free(aux->prog); +} + +/* version of bpf_prog_put() that is called after a grace period */ +void bpf_prog_put_rcu(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) + call_rcu(&prog->aux->rcu, __prog_put_common); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) + __prog_put_common(&prog->aux->rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -426,7 +554,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put(prog); + bpf_prog_put_rcu(prog); return 0; } @@ -434,21 +562,22 @@ static const struct file_operations bpf_prog_fops = { .release = bpf_prog_release, }; -static struct bpf_prog *get_prog(struct fd f) +int bpf_prog_new_fd(struct bpf_prog *prog) { - struct bpf_prog *prog; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, + O_RDWR | O_CLOEXEC); +} +static struct bpf_prog *__bpf_prog_get(struct fd f) +{ if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_prog_fops) { fdput(f); return ERR_PTR(-EINVAL); } - prog = f.file->private_data; - - return prog; + return f.file->private_data; } /* called by sockets/tracing/seccomp before attaching program to an event @@ -459,13 +588,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = get_prog(f); - + prog = __bpf_prog_get(f); if (IS_ERR(prog)) return prog; atomic_inc(&prog->aux->refcnt); fdput(f); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get); @@ -500,11 +629,18 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; + if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_nouncharge; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -513,10 +649,10 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; prog->orig_prog = NULL; - prog->jited = false; + prog->jited = 0; atomic_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl ? 1 : 0; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -532,9 +668,11 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - bpf_prog_select_runtime(prog); + err = bpf_prog_select_runtime(prog); + if (err < 0) + goto free_used_maps; - err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ goto free_used_maps; @@ -544,20 +682,36 @@ static int bpf_prog_load(union bpf_attr *attr) free_used_maps: free_used_maps(prog->aux); free_prog: + bpf_prog_uncharge_memlock(prog); +free_prog_nouncharge: bpf_prog_free(prog); return err; } +#define BPF_OBJ_LAST_FIELD bpf_fd + +static int bpf_obj_pin(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ)) + return -EINVAL; + + return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); +} + +static int bpf_obj_get(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + return -EINVAL; + + return bpf_obj_get_user(u64_to_ptr(attr->pathname)); +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; int err; - /* the syscall is limited to root temporarily. This restriction will be - * lifted when security audit is clean. Note that eBPF+tracing must have - * this restriction, since it may pass kernel data to user space - */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) return -EPERM; if (!access_ok(VERIFY_READ, uattr, 1)) @@ -612,6 +766,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_LOAD: err = bpf_prog_load(&attr); break; + case BPF_OBJ_PIN: + err = bpf_obj_pin(&attr); + break; + case BPF_OBJ_GET: + err = bpf_obj_get(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/kernel/bpf/verifier.c b/kernel/kernel/bpf/verifier.c index 47dcd3aa6..2e7f7ab73 100644 --- a/kernel/kernel/bpf/verifier.c +++ b/kernel/kernel/bpf/verifier.c @@ -199,6 +199,7 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + bool allow_ptr_leaks; }; /* verbose verifier prints what it's seeing @@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static void verbose(const char *fmt, ...) +static __printf(1, 2) void verbose(const char *fmt, ...) { va_list args; @@ -238,6 +239,15 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -275,7 +285,7 @@ static const char *const bpf_class_string[] = { [BPF_ALU64] = "alu64", }; -static const char *const bpf_alu_string[] = { +static const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", @@ -299,7 +309,7 @@ static const char *const bpf_ldst_string[] = { [BPF_DW >> 3] = "u64", }; -static const char *const bpf_jmp_string[] = { +static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", @@ -530,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size) return -EINVAL; } +static bool is_spillable_regtype(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_MAP_VALUE: + case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_STACK: + case PTR_TO_CTX: + case FRAME_PTR: + case CONST_PTR_TO_MAP: + return true; + default: + return false; + } +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -542,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, */ if (value_regno >= 0 && - (state->regs[value_regno].type == PTR_TO_MAP_VALUE || - state->regs[value_regno].type == PTR_TO_STACK || - state->regs[value_regno].type == PTR_TO_CTX)) { + is_spillable_regtype(state->regs[value_regno].type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -635,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } +static bool is_pointer_value(struct verifier_env *env, int regno) +{ + if (env->allow_ptr_leaks) + return false; + + switch (env->cur_state.regs[regno].type) { + case UNKNOWN_VALUE: + case CONST_IMM: + return false; + default: + return true; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -648,6 +685,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, struct verifier_state *state = &env->cur_state; int size, err = 0; + if (state->regs[regno].type == PTR_TO_STACK) + off += state->regs[regno].imm; + size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; @@ -658,24 +698,42 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into map\n", value_regno); + return -EACCES; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); } else if (state->regs[regno].type == PTR_TO_CTX) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into ctx\n", value_regno); + return -EACCES; + } err = check_ctx_access(env, off, size, t); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == FRAME_PTR) { + } else if (state->regs[regno].type == FRAME_PTR || + state->regs[regno].type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } - if (t == BPF_WRITE) + if (t == BPF_WRITE) { + if (!env->allow_ptr_leaks && + state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } err = check_stack_write(state, off, size, value_regno); - else + } else { err = check_stack_read(state, off, size, value_regno); + } } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[state->regs[regno].type]); @@ -763,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_ANYTHING) { + if (is_pointer_value(env, regno)) { + verbose("R%d leaks addr into helper function\n", regno); + return -EACCES; + } return 0; + } if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -833,6 +896,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i < ARRAY_SIZE(func_limit); i++) { + bool_map = (map->map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map & func pair match it can continue. + * don't allow any other map type to be passed into + * the special func; + */ + if (bool_func && bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -907,12 +992,18 @@ static int check_call(struct verifier_env *env, int func_id) fn->ret_type, func_id); return -EINVAL; } + + err = check_map_func_compatibility(map, func_id); + if (err) + return err; + return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { + struct reg_state *regs = env->cur_state.regs; u8 opcode = BPF_OP(insn->code); int err; @@ -937,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } + /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); if (err) @@ -973,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) */ regs[insn->dst_reg] = regs[insn->src_reg]; } else { + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d partial copy of pointer\n", + insn->src_reg); + return -EACCES; + } regs[insn->dst_reg].type = UNKNOWN_VALUE; regs[insn->dst_reg].map_ptr = NULL; } @@ -1019,11 +1121,31 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) + BPF_SRC(insn->code) == BPF_K) { stack_relative = true; + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } else if (BPF_SRC(insn->code) == BPF_X && + is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->src_reg); + return -EACCES; + } /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); @@ -1062,6 +1184,12 @@ static int check_cond_jmp_op(struct verifier_env *env, err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer comparison prohibited\n", + insn->src_reg); + return -EACCES; + } } else { if (insn->src_reg != BPF_REG_0) { verbose("BPF_JMP uses reserved fields\n"); @@ -1116,6 +1244,9 @@ static int check_cond_jmp_op(struct verifier_env *env, regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = 0; } + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + return -EACCES; } else if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE)) { @@ -1619,7 +1750,7 @@ static int do_check(struct verifier_env *env) } if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(regs, insn); + err = check_alu_op(env, insn); if (err) return err; @@ -1675,6 +1806,8 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { + enum bpf_reg_type dst_reg_type; + if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); if (err) @@ -1683,11 +1816,6 @@ static int do_check(struct verifier_env *env) continue; } - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_STX uses reserved fields\n"); - return -EINVAL; - } /* check src1 operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1697,6 +1825,8 @@ static int do_check(struct verifier_env *env) if (err) return err; + dst_reg_type = regs[insn->dst_reg].type; + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -1704,6 +1834,15 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (insn->imm == 0) { + insn->imm = dst_reg_type; + } else if (dst_reg_type != insn->imm && + (dst_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { @@ -1769,6 +1908,11 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (is_pointer_value(env, BPF_REG_0)) { + verbose("R0 leaks addr as return value\n"); + return -EACCES; + } + process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { @@ -1822,12 +1966,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0)) { + (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { verbose("BPF_LDX uses reserved fields\n"); return -EINVAL; } + if (BPF_CLASS(insn->code) == BPF_STX && + ((BPF_MODE(insn->code) != BPF_MEM && + BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1849,8 +1999,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) } f = fdget(insn->imm); - - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); @@ -1882,8 +2031,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; @@ -1934,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } @@ -1950,12 +2098,17 @@ static int convert_ctx_accesses(struct verifier_env *env) struct bpf_prog *new_prog; u32 cnt; int i; + enum bpf_access_type type; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + type = BPF_READ; + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + type = BPF_WRITE; + else continue; if (insn->imm != PTR_TO_CTX) { @@ -1965,8 +2118,8 @@ static int convert_ctx_accesses(struct verifier_env *env) } cnt = env->prog->aux->ops-> - convert_ctx_access(insn->dst_reg, insn->src_reg, - insn->off, insn_buf); + convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; @@ -2086,6 +2239,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = do_check(env); skip_full_check: diff --git a/kernel/kernel/cgroup.c b/kernel/kernel/cgroup.c index 21f5bdfc5..a0087e536 100644 --- a/kernel/kernel/cgroup.c +++ b/kernel/kernel/cgroup.c @@ -45,7 +45,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/rwsem.h> +#include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> @@ -57,7 +57,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> - +#include <linux/cpuset.h> #include <linux/atomic.h> /* @@ -75,7 +75,7 @@ * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * - * css_set_rwsem protects task->cgroups pointer, the list of css_set + * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in @@ -83,12 +83,12 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -DECLARE_RWSEM(css_set_rwsem); +DEFINE_SPINLOCK(css_set_lock); EXPORT_SYMBOL_GPL(cgroup_mutex); -EXPORT_SYMBOL_GPL(css_set_rwsem); +EXPORT_SYMBOL_GPL(css_set_lock); #else static DEFINE_MUTEX(cgroup_mutex); -static DECLARE_RWSEM(css_set_rwsem); +static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -98,14 +98,22 @@ static DECLARE_RWSEM(css_set_rwsem); static DEFINE_SPINLOCK(cgroup_idr_lock); /* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ static DEFINE_SPINLOCK(release_agent_path_lock); +struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + #define cgroup_assert_mutex_or_rcu_locked() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_mutex), \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* @@ -136,12 +144,34 @@ static const char *cgroup_subsys_name[] = { }; #undef SUBSYS +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ +#define SUBSYS(_x) \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); +#include <linux/cgroup_subsys.h> +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, +static struct static_key_true *cgroup_subsys_enabled_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, +static struct static_key_true *cgroup_subsys_on_dfl_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ struct cgroup_root cgrp_dfl_root; +EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the @@ -149,14 +179,8 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; -/* - * Set by the boot param of the same name and makes subsystems with NULL - * ->dfl_files to use ->legacy_files on the default hierarchy. - */ -static bool cgroup_legacy_files_on_dfl; - /* some controllers are not supported in the default hierarchy */ -static unsigned int cgrp_dfl_root_inhibit_ss_mask; +static unsigned long cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -175,26 +199,104 @@ static DEFINE_IDR(cgroup_hierarchy_idr); */ static u64 css_serial_nr_next = 1; -/* This flag indicates whether tasks in the fork and exit paths should - * check for fork/exit handlers to call. This avoids us having to do - * extra work in the fork/exit path if none of the subsystems need to - * be called. +/* + * These bitmask flags indicate whether tasks in the fork and exit paths have + * fork/exit handlers to call. This avoids us having to do extra work in the + * fork/exit path to check which subsystems have fork/exit callbacks. */ -static int need_forkexit_callback __read_mostly; +static unsigned long have_fork_callback __read_mostly; +static unsigned long have_exit_callback __read_mostly; +static unsigned long have_free_callback __read_mostly; + +/* Ditto for the can_fork callback. */ +static unsigned long have_canfork_callback __read_mostly; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned int ss_mask); + unsigned long ss_mask); +static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID + * @ssid: subsys ID of interest + * + * cgroup_subsys_enabled() can only be used with literal subsys names which + * is fine for individual subsystems but unsuitable for cgroup core. This + * is slower static_key_enabled() based test indexed by @ssid. + */ +static bool cgroup_ssid_enabled(int ssid) +{ + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); +} + +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differnetly depending on the + * interface version. + * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. + */ +static bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -203,7 +305,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; @@ -261,7 +363,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * - * Similar to cgroup_css() but returns the effctive css, which is defined + * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. @@ -327,6 +429,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return !(cgrp->self.flags & CSS_ONLINE); } +static void cgroup_get(struct cgroup *cgrp) +{ + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); +} + +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + +static void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -409,6 +527,24 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) +/** + * for_each_subsys_which - filter for_each_subsys with a bitmask + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ss_maskp: a pointer to the bitmask + * + * The block will only run for cases where the ssid-th bit (1 << ssid) of + * mask is set to 1. + */ +#define for_each_subsys_which(ss, ssid, ss_maskp) \ + if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ + (ssid) = 0; \ + else \ + for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ + if (((ss) = cgroup_subsys[ssid]) && false) \ + break; \ + else + /* iterate across the hierarchies */ #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) @@ -458,19 +594,31 @@ struct css_set init_css_set = { .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ /** + * css_set_populated - does a css_set contain any tasks? + * @cset: target css_set + */ +static bool css_set_populated(struct css_set *cset) +{ + lockdep_assert_held(&css_set_lock); + + return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); +} + +/** * cgroup_update_populated - updated populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * - * @cgrp is either getting the first task (css_set) or losing the last. - * Update @cgrp->populated_cnt accordingly. The count is propagated - * towards root so that a given cgroup's populated_cnt is zero iff the - * cgroup and all its descendants are empty. + * One of the css_sets associated with @cgrp is either getting its first + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. * * @cgrp's interface file "cgroup.populated" is zero if * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt @@ -480,7 +628,7 @@ static int css_set_count = 1; /* 1 for init_css_set */ */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); do { bool trigger; @@ -493,12 +641,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - if (cgrp->populated_kn) - kernfs_notify(cgrp->populated_kn); + check_for_release(cgrp); + cgroup_file_notify(&cgrp->events_file); + cgrp = cgroup_parent(cgrp); } while (cgrp); } +/** + * css_set_update_populated - update populated state of a css_set + * @cset: target css_set + * @populated: whether @cset is populated or depopulated + * + * @cset is either getting the first task or losing the last. Update the + * ->populated_cnt of all associated cgroups accordingly. + */ +static void css_set_update_populated(struct css_set *cset, bool populated) +{ + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + cgroup_update_populated(link->cgrp, populated); +} + +/** + * css_set_move_task - move a task from one css_set to another + * @task: task being moved + * @from_cset: css_set @task currently belongs to (may be NULL) + * @to_cset: new css_set @task is being moved to (may be NULL) + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks + * + * Move @task from @from_cset to @to_cset. If @task didn't belong to any + * css_set, @from_cset can be NULL. If @task is being disassociated + * instead of moved, @to_cset can be NULL. + * + * This function automatically handles populated_cnt updates and + * css_task_iter adjustments but the caller is responsible for managing + * @from_cset and @to_cset's reference counts. + */ +static void css_set_move_task(struct task_struct *task, + struct css_set *from_cset, struct css_set *to_cset, + bool use_mg_tasks) +{ + lockdep_assert_held(&css_set_lock); + + if (from_cset) { + struct css_task_iter *it, *pos; + + WARN_ON_ONCE(list_empty(&task->cg_list)); + + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + + list_del_init(&task->cg_list); + if (!css_set_populated(from_cset)) + css_set_update_populated(from_cset, false); + } else { + WARN_ON_ONCE(!list_empty(&task->cg_list)); + } + + if (to_cset) { + /* + * We are synchronized through cgroup_threadgroup_rwsem + * against PF_EXITING setting such that we can't race + * against cgroup_exit() changing the css_set to + * init_css_set and dropping the old one. + */ + WARN_ON_ONCE(task->flags & PF_EXITING); + + if (!css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + rcu_assign_pointer(task->cgroups, to_cset); + list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : + &to_cset->tasks); + } +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into @@ -526,29 +755,24 @@ static void put_css_set_locked(struct css_set *cset) struct cgroup_subsys *ss; int ssid; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (!atomic_dec_and_test(&cset->refcount)) return; - /* This css_set is dead. unlink it and release cgroup refcounts */ - for_each_subsys(ss, ssid) + /* This css_set is dead. unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { - struct cgroup *cgrp = link->cgrp; - list_del(&link->cset_link); list_del(&link->cgrp_link); - - /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links)) { - cgroup_update_populated(cgrp, false); - check_for_release(cgrp); - } - + if (cgroup_parent(link->cgrp)) + cgroup_put(link->cgrp); kfree(link); } @@ -565,9 +789,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); put_css_set_locked(cset); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /* @@ -756,15 +980,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link->cset = cset; link->cgrp = cgrp; - if (list_empty(&cgrp->cset_links)) - cgroup_update_populated(cgrp, true); - list_move(&link->cset_link, &cgrp->cset_links); - /* - * Always add links to the tail of the list so that the list - * is sorted by order of hierarchy creation + * Always add links to the tail of the lists so that the lists are + * in choronological order. */ + list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); + + if (cgroup_parent(cgrp)) + cgroup_get(cgrp); } /** @@ -790,11 +1014,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); if (cset) return cset; @@ -815,13 +1039,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->mg_preload_node); INIT_LIST_HEAD(&cset->mg_node); + INIT_LIST_HEAD(&cset->task_iters); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -839,11 +1064,15 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - for_each_subsys(ss, ssid) + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + list_add_tail(&cset->e_cset_node[ssid], - &cset->subsys[ssid]->cgroup->e_csets[ssid]); + &css->cgroup->e_csets[ssid]); + css_get(css); + } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return cset; } @@ -882,7 +1111,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarhcy ID shoulid already have been released */ + /* hierarchy ID should already have been released */ WARN_ON_ONCE(root->hierarchy_id); idr_destroy(&root->cgroup_idr); @@ -907,14 +1136,15 @@ static void cgroup_destroy_root(struct cgroup_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } - up_write(&css_set_rwsem); + + spin_unlock_bh(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -936,7 +1166,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup *res = NULL; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (cset == &init_css_set) { res = &root->cgrp; @@ -959,7 +1189,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_rwsem held. + * called with cgroup_mutex and css_set_lock held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -998,17 +1228,19 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { + struct cgroup_subsys *ss = cft->ss; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", - cft->ss->name, cft->name); + cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, + cft->name); else strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; @@ -1018,43 +1250,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander + * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; - if (cft->mode) - return cft->mode; - if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write) - mode |= S_IWUSR; + if (cft->write_u64 || cft->write_s64 || cft->write) { + if (cft->flags & CFTYPE_WORLD_WRITABLE) + mode |= S_IWUGO; + else + mode |= S_IWUSR; + } return mode; } -static void cgroup_get(struct cgroup *cgrp) -{ - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); -} - -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - /** * cgroup_calc_child_subsys_mask - calculate child_subsys_mask * @cgrp: the target cgroup @@ -1068,11 +1282,11 @@ static void cgroup_put(struct cgroup *cgrp) * @subtree_control is to be applied to @cgrp. The returned mask is always * a superset of @subtree_control and follows the usual hierarchy rules. */ -static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned int subtree_control) +static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, + unsigned long subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned int cur_ss_mask = subtree_control; + unsigned long cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1082,11 +1296,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, return cur_ss_mask; while (true) { - unsigned int new_ss_mask = cur_ss_mask; + unsigned long new_ss_mask = cur_ss_mask; - for_each_subsys(ss, ssid) - if (cur_ss_mask & (1 << ssid)) - new_ss_mask |= ss->depends_on; + for_each_subsys_which(ss, ssid, &cur_ss_mask) + new_ss_mask |= ss->depends_on; /* * Mask out subsystems which aren't available. This can @@ -1192,41 +1405,85 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** - * cgroup_clear_dir - remove subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be removed + * css_clear_dir - remove subsys files in a cgroup directory + * @css: taget css + * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) +static void css_clear_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) { - struct cgroup_subsys *ss; - int i; + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts; - for_each_subsys(ss, i) { - struct cftype *cfts; + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); +} - if (!(subsys_mask & (1 << i))) - continue; - list_for_each_entry(cfts, &ss->cfts, node) - cgroup_addrm_files(cgrp, cfts, false); +/** + * css_populate_dir - create subsys files in a cgroup directory + * @css: target css + * @cgrp_overried: specify if target cgroup is different from css->cgroup + * + * On failure, no file is added. + */ +static int css_populate_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) +{ + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts, *failed_cfts; + int ret; + + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_dfl_base_files; + else + cfts = cgroup_legacy_base_files; + + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); } + + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } + } + return 0; +err: + list_for_each_entry(cfts, &css->ss->cfts, node) { + if (cfts == failed_cfts) + break; + cgroup_addrm_files(css, cgrp, cfts, false); + } + return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, + unsigned long ss_mask) { + struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned int tmp_ss_mask; + unsigned long tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); - for_each_subsys(ss, ssid) { - if (!(ss_mask & (1 << ssid))) - continue; - + for_each_subsys_which(ss, ssid, &ss_mask) { /* if @ss has non-root csses attached to it, can't move */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; @@ -1241,10 +1498,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); - if (ret) { - if (dst_root != &cgrp_dfl_root) - return ret; + for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + struct cgroup *scgrp = &ss->root->cgrp; + int tssid; + + ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); + if (!ret) + continue; /* * Rebinding back to the default root is not allowed to @@ -1252,61 +1512,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * be rare. Moving subsystems back and forth even more so. * Just warn about it and continue. */ - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + if (dst_root == &cgrp_dfl_root) { + if (cgrp_dfl_root_visible) { + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + } + continue; + } + + for_each_subsys_which(ss, tssid, &tmp_ss_mask) { + if (tssid == ssid) + break; + css_clear_dir(cgroup_css(scgrp, ss), dcgrp); } + return ret; } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys(ss, ssid) - if (ss_mask & (1 << ssid)) - cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - - for_each_subsys(ss, ssid) { - struct cgroup_root *src_root; - struct cgroup_subsys_state *css; + for_each_subsys_which(ss, ssid, &ss_mask) { + struct cgroup_root *src_root = ss->root; + struct cgroup *scgrp = &src_root->cgrp; + struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset; - if (!(ss_mask & (1 << ssid))) - continue; - - src_root = ss->root; - css = cgroup_css(&src_root->cgrp, ss); + WARN_ON(!css || cgroup_css(dcgrp, ss)); - WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); + css_clear_dir(css, NULL); - RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); - rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); + RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); + rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = &dst_root->cgrp; + css->cgroup = dcgrp; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], - &dst_root->cgrp.e_csets[ss->id]); - up_write(&css_set_rwsem); + &dcgrp->e_csets[ss->id]); + spin_unlock_bh(&css_set_lock); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(&src_root->cgrp); + scgrp->subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(scgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) { - dst_root->cgrp.subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + if (dst_root == &cgrp_dfl_root) { + static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); + } else { + dcgrp->subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(dcgrp); + static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } if (ss->bind) ss->bind(css); } - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dcgrp->kn); return 0; } @@ -1317,9 +1583,10 @@ static int cgroup_show_options(struct seq_file *seq, struct cgroup_subsys *ss; int ssid; - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->name, NULL); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1339,7 +1606,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned int subsys_mask; + unsigned long subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1352,7 +1619,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned int mask = -1U; + unsigned long mask = -1UL; struct cgroup_subsys *ss; int nr_opts = 0; int i; @@ -1433,9 +1700,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) } for_each_subsys(ss, i) { - if (strcmp(token, ss->name)) + if (strcmp(token, ss->legacy_name)) continue; - if (ss->disabled) + if (!cgroup_ssid_enabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -1466,7 +1733,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (!ss->disabled) + if (cgroup_ssid_enabled(i)) opts->subsys_mask |= (1 << i); /* @@ -1496,7 +1763,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned int added_mask, removed_mask; + unsigned long added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); @@ -1562,7 +1829,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1592,14 +1859,16 @@ static void cgroup_enable_task_cg_lists(void) if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); - list_add(&p->cg_list, &cset->tasks); + if (!css_set_populated(cset)) + css_set_update_populated(cset, true); + list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } spin_unlock_irq(&p->sighand->siglock); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1642,17 +1911,16 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; - struct cftype *base_files; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -1663,7 +1931,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; /* - * We're accessing css_set_count without locking css_set_rwsem here, + * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. The worst that can happen is that we * have some link structures left over @@ -1685,12 +1953,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) } root_cgrp->kn = root->kf_root->kn; - if (root == &cgrp_dfl_root) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(root_cgrp, base_files, true); + ret = css_populate_dir(&root_cgrp->self, NULL); if (ret) goto destroy_root; @@ -1710,10 +1973,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. */ - down_write(&css_set_rwsem); - hash_for_each(css_set_table, i, cset, hlist) + spin_lock_bh(&css_set_lock); + hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); - up_write(&css_set_rwsem); + if (css_set_populated(cset)) + cgroup_update_populated(root_cgrp, true); + } + spin_unlock_bh(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -1946,7 +2212,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -1959,7 +2225,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -1971,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -1987,28 +2256,75 @@ struct cgroup_taskset { struct task_struct *cur_task; }; +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +/** + * cgroup_taskset_add - try to add a migration target task to a taskset + * @task: target task + * @tset: target taskset + * + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. @task's css_set should have + * been added as a migration source and @task->cg_list will be moved from + * the css_set's tasks list to mg_tasks one. + */ +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) +{ + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + return; + + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + return; + + cset = task_css_set(task); + if (!cset->mg_src_cgrp) + return; + + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset->src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); +} + /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2023,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2034,47 +2362,92 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) } /** - * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp: the cgroup @tsk is being migrated from - * @tsk: the task being migrated - * @new_cset: the new css_set @tsk is being attached to + * cgroup_taskset_migrate - migrate a taskset to a cgroup + * @tset: taget taskset + * @dst_cgrp: destination cgroup * - * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. + * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the + * ->can_attach callbacks fails and guarantees that either all or none of + * the tasks in @tset are migrated. @tset is consumed regardless of + * success. */ -static void cgroup_task_migrate(struct cgroup *old_cgrp, - struct task_struct *tsk, - struct css_set *new_cset) +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup *dst_cgrp) { - struct css_set *old_cset; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + struct cgroup_subsys_state *css, *failed_css = NULL; + struct task_struct *task, *tmp_task; + struct css_set *cset, *tmp_cset; + int i, ret; - /* - * We are synchronized through threadgroup_lock() against PF_EXITING - * setting such that we can't race against cgroup_exit() changing the - * css_set to init_css_set and dropping the old one. - */ - WARN_ON_ONCE(tsk->flags & PF_EXITING); - old_cset = task_css_set(tsk); + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; - get_css_set(new_cset); - rcu_assign_pointer(tsk->cgroups, new_cset); + /* check that we can legitimately attach to the cgroup */ + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->can_attach) { + tset->ssid = i; + ret = css->ss->can_attach(tset); + if (ret) { + failed_css = css; + goto out_cancel_attach; + } + } + } /* - * Use move_tail so that cgroup_taskset_first() still returns the - * leader after migration. This works because cgroup_migrate() - * ensures that the dst_cset of the leader is the first on the - * tset's dst_csets list. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ - list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); + spin_lock_bh(&css_set_lock); + list_for_each_entry(cset, &tset->src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { + struct css_set *from_cset = task_css_set(task); + struct css_set *to_cset = cset->mg_dst_cset; + + get_css_set(to_cset); + css_set_move_task(task, from_cset, to_cset, true); + put_css_set_locked(from_cset); + } + } + spin_unlock_bh(&css_set_lock); /* - * We just gained a reference on old_cset by taking it from the - * task. As trading it for new_cset is protected by cgroup_mutex, - * we're safe to drop it here; it will be freed under RCU. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. */ - put_css_set_locked(old_cset); + tset->csets = &tset->dst_csets; + + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } + + ret = 0; + goto out_release_tset; + +out_cancel_attach: + for_each_e_css(css, i, dst_cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } + } +out_release_tset: + spin_lock_bh(&css_set_lock); + list_splice_init(&tset->dst_csets, &tset->src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + spin_unlock_bh(&css_set_lock); + return ret; } /** @@ -2090,14 +2463,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /** @@ -2110,10 +2483,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding threadgroup_lock even if the - * target is a process. Threads may be created and destroyed but as long - * as cgroup_mutex is not dropped, no new css_set can be put into play and - * the preloaded css_sets are guaranteed to cover all migrations. + * This function may be called without holding cgroup_threadgroup_rwsem + * even if the target is a process. Threads may be created and destroyed + * but as long as cgroup_mutex is not dropped, no new css_set can be put + * into play and the preloaded css_sets are guaranteed to cover all + * migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2122,7 +2496,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2211,12 +2585,12 @@ err: /** * cgroup_migrate - migrate a process or task to a cgroup - * @cgrp: the destination cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task + * @cgrp: the destination cgroup * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding threadgroup_lock of @leader. The + * process, the caller must be holding cgroup_threadgroup_rwsem. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2227,115 +2601,29 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) -{ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct cgroup_subsys_state *css, *failed_css = NULL; - struct css_set *cset, *tmp_cset; - struct task_struct *task, *tmp_task; - int i, ret; +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup *cgrp) +{ + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + struct task_struct *task; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); task = leader; do { - /* @task either already exited or can't exit until the end */ - if (task->flags & PF_EXITING) - goto next; - - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - goto next; - - cset = task_css_set(task); - if (!cset->mg_src_cgrp) - goto next; - - /* - * cgroup_taskset_first() must always return the leader. - * Take care to avoid disturbing the ordering. - */ - list_move_tail(&task->cg_list, &cset->mg_tasks); - if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset.src_csets); - if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset.dst_csets); - next: + cgroup_taskset_add(task, &tset); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); - up_write(&css_set_rwsem); - - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset.src_csets)) - return 0; - - /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, cgrp) { - if (css->ss->can_attach) { - ret = css->ss->can_attach(css, &tset); - if (ret) { - failed_css = css; - goto out_cancel_attach; - } - } - } - - /* - * Now that we're guaranteed success, proceed to move all tasks to - * the new cgroup. There are no failure cases after here, so this - * is the commit point. - */ - down_write(&css_set_rwsem); - list_for_each_entry(cset, &tset.src_csets, mg_node) { - list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) - cgroup_task_migrate(cset->mg_src_cgrp, task, - cset->mg_dst_cset); - } - up_write(&css_set_rwsem); - - /* - * Migration is committed, all target tasks are now on dst_csets. - * Nothing is sensitive to fork() after this point. Notify - * controllers that migration is complete. - */ - tset.csets = &tset.dst_csets; - - for_each_e_css(css, i, cgrp) - if (css->ss->attach) - css->ss->attach(css, &tset); - - ret = 0; - goto out_release_tset; + spin_unlock_bh(&css_set_lock); -out_cancel_attach: - for_each_e_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } -out_release_tset: - down_write(&css_set_rwsem); - list_splice_init(&tset.dst_csets, &tset.src_csets); - list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { - list_splice_tail_init(&cset->mg_tasks, &cset->tasks); - list_del_init(&cset->mg_node); - } - up_write(&css_set_rwsem); - return ret; + return cgroup_taskset_migrate(&tset, cgrp); } /** @@ -2344,7 +2632,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and threadgroup_lock of @leader. + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2354,7 +2642,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, int ret; /* look up all src csets */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2364,17 +2652,58 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); if (!ret) - ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp); cgroup_migrate_finish(&preloaded_csets); return ret; } +static int cgroup_procs_write_permission(struct task_struct *task, + struct cgroup *dst_cgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + int ret = 0; + + /* + * even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + + if (!ret && cgroup_on_dfl(dst_cgrp)) { + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *cgrp; + struct inode *inode; + + spin_lock_bh(&css_set_lock); + cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_bh(&css_set_lock); + + while (!cgroup_is_descendant(dst_cgrp, cgrp)) + cgrp = cgroup_parent(cgrp); + + ret = -ENOMEM; + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (inode) { + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + } + } + + put_cred(tcred); + return ret; +} + /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock @@ -2384,7 +2713,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; - const struct cred *cred = current_cred(), *tcred; struct cgroup *cgrp; pid_t pid; int ret; @@ -2396,29 +2724,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; -retry_find_task: + percpu_down_write(&cgroup_threadgroup_rwsem); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_cgroup; - } - /* - * even if we're attaching all tasks in the thread group, we - * only need to check permissions on one of them. - */ - tcred = __task_cred(tsk); - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rcu_read_unlock(); - ret = -EACCES; - goto out_unlock_cgroup; + goto out_unlock_rcu; } - } else + } else { tsk = current; + } if (threadgroup) tsk = tsk->group_leader; @@ -2430,36 +2746,25 @@ retry_find_task: */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - rcu_read_unlock(); - goto out_unlock_cgroup; + goto out_unlock_rcu; } get_task_struct(tsk); rcu_read_unlock(); - threadgroup_lock(tsk); - if (threadgroup) { - if (!thread_group_leader(tsk)) { - /* - * a race with de_thread from another thread's exec() - * may strip us of our leadership, if this happens, - * there is no choice but to throw this task away and - * try again; this is - * "double-double-toil-and-trouble-check locking". - */ - threadgroup_unlock(tsk); - put_task_struct(tsk); - goto retry_find_task; - } - } - - ret = cgroup_attach_task(cgrp, tsk, threadgroup); - - threadgroup_unlock(tsk); + ret = cgroup_procs_write_permission(tsk, cgrp, of); + if (!ret) + ret = cgroup_attach_task(cgrp, tsk, threadgroup); put_task_struct(tsk); -out_unlock_cgroup: + goto out_unlock_threadgroup; + +out_unlock_rcu: + rcu_read_unlock(); +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -2480,9 +2785,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (root == &cgrp_dfl_root) continue; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2541,19 +2846,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; - for_each_subsys(ss, ssid) { - if (ss_mask & (1 << ssid)) { - if (printed) - seq_putc(seq, ' '); - seq_printf(seq, "%s", ss->name); - printed = true; - } + for_each_subsys_which(ss, ssid, &ss_mask) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; } if (printed) seq_putc(seq, '\n'); @@ -2599,14 +2902,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_subsys_state *css; struct css_set *src_cset; int ret; lockdep_assert_held(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* look up all csses currently attached to @cgrp's subtree */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { struct cgrp_cset_link *link; @@ -2618,68 +2924,31 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_migrate_add_src(link->cset, cgrp, &preloaded_csets); } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); if (ret) goto out_finish; + spin_lock_bh(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { - struct task_struct *last_task = NULL, *task; + struct task_struct *task, *ntask; /* src_csets precede dst_csets, break on the first dst_cset */ if (!src_cset->mg_src_cgrp) break; - /* - * All tasks in src_cset need to be migrated to the - * matching dst_cset. Empty it process by process. We - * walk tasks but migrate processes. The leader might even - * belong to a different cset but such src_cset would also - * be among the target src_csets because the default - * hierarchy enforces per-process membership. - */ - while (true) { - down_read(&css_set_rwsem); - task = list_first_entry_or_null(&src_cset->tasks, - struct task_struct, cg_list); - if (task) { - task = task->group_leader; - WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); - get_task_struct(task); - } - up_read(&css_set_rwsem); - - if (!task) - break; - - /* guard against possible infinite loop */ - if (WARN(last_task == task, - "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) - goto out_finish; - last_task = task; - - threadgroup_lock(task); - /* raced against de_thread() from another thread? */ - if (!thread_group_leader(task)) { - threadgroup_unlock(task); - put_task_struct(task); - continue; - } - - ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - - threadgroup_unlock(task); - put_task_struct(task); - - if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) - goto out_finish; - } + /* all tasks in src_csets need to be migrated */ + list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) + cgroup_taskset_add(task, &tset); } + spin_unlock_bh(&css_set_lock); + ret = cgroup_taskset_migrate(&tset, cgrp); out_finish: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -2688,8 +2957,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned int enable = 0, disable = 0; - unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + unsigned long enable = 0, disable = 0; + unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2701,11 +2970,13 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { + unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; + if (tok[0] == '\0') continue; - for_each_subsys(ss, ssid) { - if (ss->disabled || strcmp(tok + 1, ss->name) || - ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) + for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + if (!cgroup_ssid_enabled(ssid) || + strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { @@ -2792,10 +3063,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * still around. In such cases, wait till it's gone using * offline_waitq. */ - for_each_subsys(ss, ssid) { - if (!(css_enable & (1 << ssid))) - continue; - + for_each_subsys_which(ss, ssid, &css_enable) { cgroup_for_each_live_child(child, cgrp) { DEFINE_WAIT(wait); @@ -2832,7 +3100,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = create_css(child, ss, cgrp->subtree_control & (1 << ssid)); else - ret = cgroup_populate_dir(child, 1 << ssid); + ret = css_populate_dir(cgroup_css(child, ss), + NULL); if (ret) goto err_undo_css; } @@ -2865,7 +3134,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (css_disable & (1 << ssid)) { kill_css(css); } else { - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); if (ss->css_reset) ss->css_reset(css); } @@ -2913,15 +3182,16 @@ err_undo_css: if (css_enable & (1 << ssid)) kill_css(css); else - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); } } goto out_unlock; } -static int cgroup_populated_show(struct seq_file *seq, void *v) +static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + seq_printf(seq, "populated %d\n", + cgroup_is_populated(seq_css(seq)->cgroup)); return 0; } @@ -3064,7 +3334,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } -static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, + struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3086,31 +3357,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_populated_show) - cgrp->populated_kn = kn; + if (cft->file_offset) { + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = kn; + spin_unlock_irq(&cgroup_file_kn_lock); + } + return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory - * @cgrp: the target cgroup + * @css: the target css + * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * For removals, this function never fails. If addition fails, this - * function doesn't remove files already added. The caller is responsible - * for cleaning up. + * For removals, this function never fails. */ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add) { - struct cftype *cft; + struct cftype *cft, *cft_end = NULL; int ret; lockdep_assert_held(&cgroup_mutex); - for (cft = cfts; cft->name[0] != '\0'; cft++) { +restart: + for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; @@ -3122,11 +3400,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if (is_add) { - ret = cgroup_add_file(cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); - return ret; + cft_end = cft; + is_add = false; + goto restart; } } else { cgroup_rm_file(cgrp, cft); @@ -3152,7 +3432,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - ret = cgroup_addrm_files(cgrp, cfts, is_add); + ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } @@ -3264,7 +3544,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; - if (ss->disabled) + if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') @@ -3314,21 +3594,28 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - /* - * If legacy_flies_on_dfl, we want to show the legacy files on the - * dfl hierarchy but iff the target subsystem hasn't been updated - * for the dfl hierarchy yet. - */ - if (!cgroup_legacy_files_on_dfl || - ss->dfl_cftypes != ss->legacy_cftypes) { - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; - } - + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -3339,10 +3626,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return count; } @@ -3574,22 +3861,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css) } /** - * css_advance_task_iter - advance a task itererator to the next css_set + * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void css_advance_task_iter(struct css_task_iter *it) +static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; + lockdep_assert_held(&css_set_lock); + /* Advance to the next non-empty css_set */ do { l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; + it->task_pos = NULL; return; } @@ -3600,7 +3890,7 @@ static void css_advance_task_iter(struct css_task_iter *it) link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } - } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + } while (!css_set_populated(cset)); it->cset_pos = l; @@ -3611,6 +3901,52 @@ static void css_advance_task_iter(struct css_task_iter *it) it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + + /* + * We don't keep css_sets locked across iteration steps and thus + * need to take steps to ensure that iteration can be resumed after + * the lock is re-acquired. Iteration is performed at two levels - + * css_sets and tasks in them. + * + * Once created, a css_set never leaves its cgroup lists, so a + * pinned css_set is guaranteed to stay put and we can resume + * iteration afterwards. + * + * Tasks may leave @cset across iteration steps. This is resolved + * by registering each iterator with the css_set currently being + * walked and making css_set_move_task() advance iterators whose + * next task is leaving. + */ + if (it->cur_cset) { + list_del(&it->iters_node); + put_css_set_locked(it->cur_cset); + } + get_css_set(cset); + it->cur_cset = cset; + list_add(&it->iters_node, &cset->task_iters); +} + +static void css_task_iter_advance(struct css_task_iter *it) +{ + struct list_head *l = it->task_pos; + + lockdep_assert_held(&css_set_lock); + WARN_ON_ONCE(!l); + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ + l = l->next; + + if (l == it->tasks_head) + l = it->mg_tasks_head->next; + + if (l == it->mg_tasks_head) + css_task_iter_advance_css_set(it); + else + it->task_pos = l; } /** @@ -3622,19 +3958,16 @@ static void css_advance_task_iter(struct css_task_iter *it) * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. - * - * Note that this function acquires a lock which is released when the - * iteration finishes. The caller can't sleep while iteration is in - * progress. */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_rwsem) { /* no one should try to iterate before mounting cgroups */ WARN_ON_ONCE(!use_task_css_set_links); - down_read(&css_set_rwsem); + memset(it, 0, sizeof(*it)); + + spin_lock_bh(&css_set_lock); it->ss = css->ss; @@ -3645,7 +3978,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css, it->cset_head = it->cset_pos; - css_advance_task_iter(it); + css_task_iter_advance_css_set(it); + + spin_unlock_bh(&css_set_lock); } /** @@ -3658,30 +3993,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { - struct task_struct *res; - struct list_head *l = it->task_pos; - - /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_pos) - return NULL; - res = list_entry(l, struct task_struct, cg_list); + if (it->cur_task) { + put_task_struct(it->cur_task); + it->cur_task = NULL; + } - /* - * Advance iterator to find next entry. cset->tasks is consumed - * first and then ->mg_tasks. After ->mg_tasks, we move onto the - * next cset. - */ - l = l->next; + spin_lock_bh(&css_set_lock); - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (it->task_pos) { + it->cur_task = list_entry(it->task_pos, struct task_struct, + cg_list); + get_task_struct(it->cur_task); + css_task_iter_advance(it); + } - if (l == it->mg_tasks_head) - css_advance_task_iter(it); - else - it->task_pos = l; + spin_unlock_bh(&css_set_lock); - return res; + return it->cur_task; } /** @@ -3691,9 +4019,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_rwsem) { - up_read(&css_set_rwsem); + if (it->cur_cset) { + spin_lock_bh(&css_set_lock); + list_del(&it->iters_node); + put_css_set_locked(it->cur_cset); + spin_unlock_bh(&css_set_lock); + } + + if (it->cur_task) + put_task_struct(it->cur_task); } /** @@ -3718,10 +4053,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); if (ret) @@ -3739,7 +4074,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(to, task, false); + ret = cgroup_migrate(task, false, to); put_task_struct(task); } } while (task && !ret); @@ -4236,13 +4571,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", + .file_offset = offsetof(struct cgroup, procs_file), .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.controllers", @@ -4260,9 +4595,10 @@ static struct cftype cgroup_dfl_base_files[] = { .write = cgroup_subtree_control_write, }, { - .name = "cgroup.populated", + .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_populated_show, + .file_offset = offsetof(struct cgroup, events_file), + .seq_show = cgroup_events_show, }, { } /* terminate */ }; @@ -4277,7 +4613,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.clone_children", @@ -4297,7 +4632,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup_tasks_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", @@ -4314,37 +4648,6 @@ static struct cftype cgroup_legacy_base_files[] = { { } /* terminate */ }; -/** - * cgroup_populate_dir - create subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be added - * - * On failure, no file is added. - */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) -{ - struct cgroup_subsys *ss; - int i, ret = 0; - - /* process cftsets of each subsystem */ - for_each_subsys(ss, i) { - struct cftype *cfts; - - if (!(subsys_mask & (1 << i))) - continue; - - list_for_each_entry(cfts, &ss->cfts, node) { - ret = cgroup_addrm_files(cgrp, cfts, true); - if (ret < 0) - goto err; - } - } - return 0; -err: - cgroup_clear_dir(cgrp, subsys_mask); - return ret; -} - /* * css destruction is four-stage process. * @@ -4481,6 +4784,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4503,6 +4807,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -4556,13 +4864,13 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, if (err) goto err_free_css; - err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_percpu_ref; css->id = err; if (visible) { - err = cgroup_populate_dir(cgrp, 1 << ss->id); + err = css_populate_dir(css, NULL); if (err) goto err_free_id; } @@ -4588,7 +4896,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err_list_del: list_del_rcu(&css->sibling); - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: @@ -4605,7 +4913,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - struct cftype *base_files; int ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. @@ -4633,7 +4940,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; goto out_cancel_ref; @@ -4681,12 +4988,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - if (cgroup_on_dfl(cgrp)) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(cgrp, base_files, true); + ret = css_populate_dir(&cgrp->self, NULL); if (ret) goto out_destroy; @@ -4740,10 +5042,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -4752,8 +5059,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** @@ -4773,7 +5082,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); /* * Killing would put the base ref, but we need to keep it alive @@ -4822,19 +5131,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; - bool empty; int ssid; lockdep_assert_held(&cgroup_mutex); /* - * css_set_rwsem synchronizes access to ->cset_links and prevents - * @cgrp from being removed while put_css_set() is in progress. + * Only migration can raise populated from zero and we're already + * holding cgroup_mutex. */ - down_read(&css_set_rwsem); - empty = list_empty(&cgrp->cset_links); - up_read(&css_set_rwsem); - if (!empty) + if (cgroup_is_populated(cgrp)) return -EBUSY; /* @@ -4930,7 +5235,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; - need_forkexit_callback |= ss->fork || ss->exit; + have_fork_callback |= (bool)ss->fork << ss->id; + have_exit_callback |= (bool)ss->exit << ss->id; + have_free_callback |= (bool)ss->free << ss->id; + have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -4969,6 +5277,8 @@ int __init cgroup_init_early(void) ss->id = i; ss->name = cgroup_subsys_name[i]; + if (!ss->legacy_name) + ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); @@ -4976,6 +5286,8 @@ int __init cgroup_init_early(void) return 0; } +static unsigned long cgroup_disable_mask __initdata; + /** * cgroup_init - cgroup initialization * @@ -4986,8 +5298,9 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int ssid, err; + int ssid; + BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); @@ -5021,14 +5334,15 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (ss->disabled) + if (cgroup_disable_mask & (1 << ssid)) { + static_branch_disable(cgroup_subsys_enabled_key[ssid]); + printk(KERN_INFO "Disabling %s control group subsystem\n", + ss->name); continue; + } cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) - ss->dfl_cftypes = ss->legacy_cftypes; - if (!ss->dfl_cftypes) cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; @@ -5043,17 +5357,10 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } - err = sysfs_create_mount_point(fs_kobj, "cgroup"); - if (err) - return err; - - err = register_filesystem(&cgroup_fs_type); - if (err < 0) { - sysfs_remove_mount_point(fs_kobj, "cgroup"); - return err; - } + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); + WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); - proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); return 0; } @@ -5101,7 +5408,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; @@ -5112,26 +5419,48 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, continue; seq_printf(m, "%d:", root->hierarchy_id); - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", + ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); + cgrp = task_cgroup_from_root(tsk, root); - path = cgroup_path(cgrp, buf, PATH_MAX); - if (!path) { - retval = -ENAMETOOLONG; - goto out_unlock; + + /* + * On traditional hierarchies, all zombie tasks show up as + * belonging to the root cgroup. On the default hierarchy, + * while a zombie doesn't show up in "cgroup.procs" and + * thus can't be migrated, its /proc/PID/cgroup keeps + * reporting the cgroup it belonged to before exiting. If + * the cgroup is removed before the zombie is reaped, + * " (deleted)" is appended to the cgroup path. + */ + if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; + goto out_unlock; + } + } else { + path = "/"; } + seq_puts(m, path); - seq_putc(m, '\n'); + + if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) + seq_puts(m, " (deleted)\n"); + else + seq_putc(m, '\n'); } retval = 0; out_unlock: - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5154,8 +5483,9 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), !ss->disabled); + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); mutex_unlock(&cgroup_mutex); return 0; @@ -5173,6 +5503,19 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; +static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) +{ + if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) + return &ss_priv[i - CGROUP_CANFORK_START]; + return NULL; +} + +static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) +{ + void **private = subsys_canfork_priv_p(ss_priv, i); + return private ? *private : NULL; +} + /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5188,6 +5531,57 @@ void cgroup_fork(struct task_struct *child) } /** + * cgroup_can_fork - called on a new task before the process is exposed + * @child: the task in question. + * + * This calls the subsystem can_fork() callbacks. If the can_fork() callback + * returns an error, the fork aborts with that error code. This allows for + * a cgroup subsystem to conditionally allow or deny new forks. + */ +int cgroup_can_fork(struct task_struct *child, + void *ss_priv[CGROUP_CANFORK_COUNT]) +{ + struct cgroup_subsys *ss; + int i, j, ret; + + for_each_subsys_which(ss, i, &have_canfork_callback) { + ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + if (ret) + goto out_revert; + } + + return 0; + +out_revert: + for_each_subsys(ss, j) { + if (j >= i) + break; + if (ss->cancel_fork) + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + } + + return ret; +} + +/** + * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() + * @child: the task in question + * + * This calls the cancel_fork() callbacks if a fork failed *after* + * cgroup_can_fork() succeded. + */ +void cgroup_cancel_fork(struct task_struct *child, + void *ss_priv[CGROUP_CANFORK_COUNT]) +{ + struct cgroup_subsys *ss; + int i; + + for_each_subsys(ss, i) + if (ss->cancel_fork) + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); +} + +/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * @@ -5197,7 +5591,8 @@ void cgroup_fork(struct task_struct *child) * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child) +void cgroup_post_fork(struct task_struct *child, + void *old_ss_priv[CGROUP_CANFORK_COUNT]) { struct cgroup_subsys *ss; int i; @@ -5211,7 +5606,7 @@ void cgroup_post_fork(struct task_struct *child) * @child during its iteration. * * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_rwsem guarantees both that the + * css_set. Grabbing css_set_lock guarantees both that the * association is stable, and, on completion of the parent's * migration, @child is visible in the source of migration or * already in the destination cgroup. This guarantee is necessary @@ -5226,14 +5621,13 @@ void cgroup_post_fork(struct task_struct *child) if (use_task_css_set_links) { struct css_set *cset; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { - rcu_assign_pointer(child->cgroups, cset); - list_add(&child->cg_list, &cset->tasks); get_css_set(cset); + css_set_move_task(child, NULL, cset, false); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /* @@ -5241,11 +5635,8 @@ void cgroup_post_fork(struct task_struct *child) * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ - if (need_forkexit_callback) { - for_each_subsys(ss, i) - if (ss->fork) - ss->fork(child); - } + for_each_subsys_which(ss, i, &have_fork_callback) + ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); } /** @@ -5271,43 +5662,42 @@ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; - bool put_cset = false; int i; /* * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check cg_list without grabbing css_set_rwsem. + * with us, we can check css_set and cg_list without synchronization. */ + cset = task_css_set(tsk); + if (!list_empty(&tsk->cg_list)) { - down_write(&css_set_rwsem); - list_del_init(&tsk->cg_list); - up_write(&css_set_rwsem); - put_cset = true; + spin_lock_bh(&css_set_lock); + css_set_move_task(tsk, cset, NULL, false); + spin_unlock_bh(&css_set_lock); + } else { + get_css_set(cset); } - /* Reassign the task to the init_css_set. */ - cset = task_css_set(tsk); - RCU_INIT_POINTER(tsk->cgroups, &init_css_set); + /* see cgroup_post_fork() for details */ + for_each_subsys_which(ss, i, &have_exit_callback) + ss->exit(tsk); +} - if (need_forkexit_callback) { - /* see cgroup_post_fork() for details */ - for_each_subsys(ss, i) { - if (ss->exit) { - struct cgroup_subsys_state *old_css = cset->subsys[i]; - struct cgroup_subsys_state *css = task_css(tsk, i); +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); + struct cgroup_subsys *ss; + int ssid; - ss->exit(css, old_css, tsk); - } - } - } + for_each_subsys_which(ss, ssid, &have_free_callback) + ss->free(task); - if (put_cset) - put_css_set(cset); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) { - if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } @@ -5383,26 +5773,16 @@ static int __init cgroup_disable(char *str) continue; for_each_subsys(ss, i) { - if (!strcmp(token, ss->name)) { - ss->disabled = 1; - printk(KERN_INFO "Disabling %s control group" - " subsystem\n", ss->name); - break; - } + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + cgroup_disable_mask |= 1 << i; } } return 1; } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_set_legacy_files_on_dfl(char *str) -{ - printk("cgroup: using legacy files on the default hierarchy\n"); - cgroup_legacy_files_on_dfl = true; - return 0; -} -__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5506,7 +5886,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -5517,7 +5897,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); kfree(name_buf); return 0; } @@ -5528,7 +5908,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -5551,13 +5931,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return 0; } static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return (!cgroup_has_tasks(css->cgroup) && + return (!cgroup_is_populated(css->cgroup) && !css_has_online_children(&css->cgroup->self)); } diff --git a/kernel/kernel/cgroup_freezer.c b/kernel/kernel/cgroup_freezer.c index 92b98cc0e..2d3df82c5 100644 --- a/kernel/kernel/cgroup_freezer.c +++ b/kernel/kernel/cgroup_freezer.c @@ -155,12 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { - struct freezer *freezer = css_freezer(new_css); struct task_struct *task; - bool clear_frozen = false; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -174,22 +172,21 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = true; + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } } } - /* propagate FROZEN clearing upwards */ - while (clear_frozen && (freezer = parent_freezer(freezer))) { - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = freezer->state & CGROUP_FREEZING; - } - mutex_unlock(&freezer_mutex); } @@ -203,7 +200,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task) +static void freezer_fork(struct task_struct *task, void *private) { struct freezer *freezer; diff --git a/kernel/kernel/cgroup_pids.c b/kernel/kernel/cgroup_pids.c new file mode 100644 index 000000000..b50d5a167 --- /dev/null +++ b/kernel/kernel/cgroup_pids.c @@ -0,0 +1,316 @@ +/* + * Process number limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop any new processes from fork()ing + * after a certain limit is reached. + * + * Since it is trivial to hit the task limit without hitting any kmemcg limits + * in place, PIDs are a fundamental resource. As such, PID exhaustion must be + * preventable in the scope of a cgroup hierarchy by allowing resource limiting + * of the number of tasks in a cgroup. + * + * In order to use the `pids` controller, set the maximum number of tasks in + * pids.max (this is not available in the root cgroup for obvious reasons). The + * number of processes currently in the cgroup is given by pids.current. + * Organisational operations are not blocked by cgroup policies, so it is + * possible to have pids.current > pids.max. However, it is not possible to + * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking + * would cause a cgroup policy to be violated. + * + * To set a cgroup to have no limit, set pids.max to "max". This is the default + * for all new cgroups (N.B. that PID limits are hierarchical, so the most + * stringent limit in the hierarchy is followed). + * + * pids.current tracks all child cgroup hierarchies, so parent/pids.current is + * a superset of parent/child/pids.current. + * + * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> + +#define PIDS_MAX (PID_MAX_LIMIT + 1ULL) +#define PIDS_MAX_STR "max" + +struct pids_cgroup { + struct cgroup_subsys_state css; + + /* + * Use 64-bit types so that we can safely represent "max" as + * %PIDS_MAX = (%PID_MAX_LIMIT + 1). + */ + atomic64_t counter; + int64_t limit; +}; + +static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) +{ + return container_of(css, struct pids_cgroup, css); +} + +static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) +{ + return css_pids(pids->css.parent); +} + +static struct cgroup_subsys_state * +pids_css_alloc(struct cgroup_subsys_state *parent) +{ + struct pids_cgroup *pids; + + pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); + if (!pids) + return ERR_PTR(-ENOMEM); + + pids->limit = PIDS_MAX; + atomic64_set(&pids->counter, 0); + return &pids->css; +} + +static void pids_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_pids(css)); +} + +/** + * pids_cancel - uncharge the local pid count + * @pids: the pid cgroup state + * @num: the number of pids to cancel + * + * This function will WARN if the pid count goes under 0, because such a case is + * a bug in the pids controller proper. + */ +static void pids_cancel(struct pids_cgroup *pids, int num) +{ + /* + * A negative count (or overflow for that matter) is invalid, + * and indicates a bug in the `pids` controller proper. + */ + WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); +} + +/** + * pids_uncharge - hierarchically uncharge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to uncharge + */ +static void pids_uncharge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; parent_pids(p); p = parent_pids(p)) + pids_cancel(p, num); +} + +/** + * pids_charge - hierarchically charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function does *not* follow the pid limit set. It cannot fail and the new + * pid count may exceed the limit. This is only used for reverting failed + * attaches, where there is no other way out than violating the limit. + */ +static void pids_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; parent_pids(p); p = parent_pids(p)) + atomic64_add(num, &p->counter); +} + +/** + * pids_try_charge - hierarchically try to charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function follows the set limit. It will fail if the charge would cause + * the new value to exceed the hierarchical limit. Returns 0 if the charge + * succeded, otherwise -EAGAIN. + */ +static int pids_try_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p, *q; + + for (p = pids; parent_pids(p); p = parent_pids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + + /* + * Since new is capped to the maximum number of pid_t, if + * p->limit is %PIDS_MAX then we know that this test will never + * fail. + */ + if (new > p->limit) + goto revert; + } + + return 0; + +revert: + for (q = pids; q != p; q = parent_pids(q)) + pids_cancel(q, num); + pids_cancel(p, num); + + return -EAGAIN; +} + +static int pids_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css; + + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + /* + * No need to pin @old_css between here and cancel_attach() + * because cgroup core protects it from being freed before + * the migration completes or fails. + */ + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(pids, 1); + pids_uncharge(old_pids, 1); + } + + return 0; +} + +static void pids_cancel_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css; + + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(old_pids, 1); + pids_uncharge(pids, 1); + } +} + +/* + * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies + * on threadgroup_change_begin() held by the copy_process(). + */ +static int pids_can_fork(struct task_struct *task, void **priv_p) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + + css = task_css_check(current, pids_cgrp_id, true); + pids = css_pids(css); + return pids_try_charge(pids, 1); +} + +static void pids_cancel_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + + css = task_css_check(current, pids_cgrp_id, true); + pids = css_pids(css); + pids_uncharge(pids, 1); +} + +static void pids_free(struct task_struct *task) +{ + struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); + + pids_uncharge(pids, 1); +} + +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct pids_cgroup *pids = css_pids(css); + int64_t limit; + int err; + + buf = strstrip(buf); + if (!strcmp(buf, PIDS_MAX_STR)) { + limit = PIDS_MAX; + goto set_limit; + } + + err = kstrtoll(buf, 0, &limit); + if (err) + return err; + + if (limit < 0 || limit >= PIDS_MAX) + return -EINVAL; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. + */ + pids->limit = limit; + return nbytes; +} + +static int pids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct pids_cgroup *pids = css_pids(css); + int64_t limit = pids->limit; + + if (limit >= PIDS_MAX) + seq_printf(sf, "%s\n", PIDS_MAX_STR); + else + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 pids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return atomic64_read(&pids->counter); +} + +static struct cftype pids_files[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +struct cgroup_subsys pids_cgrp_subsys = { + .css_alloc = pids_css_alloc, + .css_free = pids_css_free, + .can_attach = pids_can_attach, + .cancel_attach = pids_cancel_attach, + .can_fork = pids_can_fork, + .cancel_fork = pids_cancel_fork, + .free = pids_free, + .legacy_cftypes = pids_files, + .dfl_cftypes = pids_files, +}; diff --git a/kernel/kernel/configs/xen.config b/kernel/kernel/configs/xen.config new file mode 100644 index 000000000..ff756221f --- /dev/null +++ b/kernel/kernel/configs/xen.config @@ -0,0 +1,48 @@ +# global stuff - these enable us to allow some +# of the not so generic stuff below for xen +CONFIG_PARAVIRT=y +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_WATCHDOG=y +CONFIG_TARGET_CORE=y +CONFIG_SCSI=y +CONFIG_FB=y +CONFIG_INPUT_MISC=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_TTY=y +# Technically not required but otherwise produces +# pretty useless systems starting from allnoconfig +# You want TCP/IP and ELF binaries right? +CONFIG_INET=y +CONFIG_BINFMT_ELF=y +# generic config +CONFIG_XEN=y +CONFIG_XEN_DOM0=y +# backend drivers +CONFIG_XEN_BACKEND=y +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_HVC_XEN=y +CONFIG_XEN_WDT=m +CONFIG_XEN_SCSI_BACKEND=m +# frontend drivers +CONFIG_XEN_FBDEV_FRONTEND=m +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_XEN_SCSI_FRONTEND=m +# others +CONFIG_XEN_BALLOON=y +CONFIG_XEN_SCRUB_PAGES=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PRIVCMD=m diff --git a/kernel/kernel/context_tracking.c b/kernel/kernel/context_tracking.c index 72d59a1a6..d8560ee3b 100644 --- a/kernel/kernel/context_tracking.c +++ b/kernel/kernel/context_tracking.c @@ -30,12 +30,23 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -void context_tracking_cpu_set(int cpu) +static bool context_tracking_recursion_enter(void) { - if (!per_cpu(context_tracking.active, cpu)) { - per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); - } + int recursion; + + recursion = __this_cpu_inc_return(context_tracking.recursion); + if (recursion == 1) + return true; + + WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion); + __this_cpu_dec(context_tracking.recursion); + + return false; +} + +static void context_tracking_recursion_exit(void) +{ + __this_cpu_dec(context_tracking.recursion); } /** @@ -47,34 +58,14 @@ void context_tracking_cpu_set(int cpu) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void context_tracking_enter(enum ctx_state state) +void __context_tracking_enter(enum ctx_state state) { - unsigned long flags; - - /* - * Repeat the user_enter() check here because some archs may be calling - * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the inline static - * key check. - */ - if (!context_tracking_is_enabled()) - return; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); - local_irq_save(flags); + if (!context_tracking_recursion_enter()) + return; + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { /* @@ -105,6 +96,28 @@ void context_tracking_enter(enum ctx_state state) */ __this_cpu_write(context_tracking.state, state); } + context_tracking_recursion_exit(); +} +NOKPROBE_SYMBOL(__context_tracking_enter); +EXPORT_SYMBOL_GPL(__context_tracking_enter); + +void context_tracking_enter(enum ctx_state state) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_enter(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -112,7 +125,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { - context_tracking_enter(CONTEXT_USER); + user_enter(); } NOKPROBE_SYMBOL(context_tracking_user_enter); @@ -128,17 +141,11 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_exit(enum ctx_state state) +void __context_tracking_exit(enum ctx_state state) { - unsigned long flags; - - if (!context_tracking_is_enabled()) + if (!context_tracking_recursion_enter()) return; - if (in_interrupt()) - return; - - local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* @@ -153,6 +160,20 @@ void context_tracking_exit(enum ctx_state state) } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } + context_tracking_recursion_exit(); +} +NOKPROBE_SYMBOL(__context_tracking_exit); +EXPORT_SYMBOL_GPL(__context_tracking_exit); + +void context_tracking_exit(enum ctx_state state) +{ + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_exit(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); @@ -160,28 +181,30 @@ EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { - context_tracking_exit(CONTEXT_USER); + user_exit(); } NOKPROBE_SYMBOL(context_tracking_user_exit); -/** - * __context_tracking_task_switch - context switch the syscall callbacks - * @prev: the task that is being switched out - * @next: the task that is being switched in - * - * The context tracking uses the syscall slow path to implement its user-kernel - * boundaries probes on syscalls. This way it doesn't impact the syscall fast - * path on CPUs that don't do context tracking. - * - * But we need to clear the flag on the previous task because it may later - * migrate to some CPU that doesn't do the context tracking. As such the TIF - * flag may not be desired there. - */ -void __context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) +void __init context_tracking_cpu_set(int cpu) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); + static __initdata bool initialized = false; + + if (!per_cpu(context_tracking.active, cpu)) { + per_cpu(context_tracking.active, cpu) = true; + static_key_slow_inc(&context_tracking_enabled); + } + + if (initialized) + return; + + /* + * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork + * This assumes that init is the only task at this early boot stage. + */ + set_tsk_thread_flag(&init_task, TIF_NOHZ); + WARN_ON_ONCE(!tasklist_empty()); + + initialized = true; } #ifdef CONFIG_CONTEXT_TRACKING_FORCE diff --git a/kernel/kernel/cpu.c b/kernel/kernel/cpu.c index 0351ac422..8edd3c716 100644 --- a/kernel/kernel/cpu.c +++ b/kernel/kernel/cpu.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <linux/lockdep.h> #include <linux/tick.h> +#include <linux/irq.h> #include <trace/events/power.h> #include "smpboot.h" @@ -126,8 +127,8 @@ struct hotplug_pcp { }; #ifdef CONFIG_PREEMPT_RT_FULL -# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock) -# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock) +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock) +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock) #else # define hotplug_lock(hp) mutex_lock(&(hp)->mutex) # define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex) @@ -384,19 +385,6 @@ void get_online_cpus(void) } EXPORT_SYMBOL_GPL(get_online_cpus); -bool try_get_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return true; - if (!mutex_trylock(&cpu_hotplug.lock)) - return false; - cpuhp_lock_acquire_tryread(); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); - return true; -} -EXPORT_SYMBOL_GPL(try_get_online_cpus); - void put_online_cpus(void) { int refcount; @@ -473,21 +461,22 @@ void cpu_hotplug_done(void) void cpu_hotplug_disable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; + cpu_hotplug_disabled++; cpu_maps_update_done(); } +EXPORT_SYMBOL_GPL(cpu_hotplug_disable); void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); cpu_maps_update_done(); } - +EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ -int __ref register_cpu_notifier(struct notifier_block *nb) +int register_cpu_notifier(struct notifier_block *nb) { int ret; cpu_maps_update_begin(); @@ -496,7 +485,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } -int __ref __register_cpu_notifier(struct notifier_block *nb) +int __register_cpu_notifier(struct notifier_block *nb) { return raw_notifier_chain_register(&cpu_chain, nb); } @@ -526,7 +515,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); -void __ref unregister_cpu_notifier(struct notifier_block *nb) +void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); @@ -534,7 +523,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -void __ref __unregister_cpu_notifier(struct notifier_block *nb) +void __unregister_cpu_notifier(struct notifier_block *nb) { raw_notifier_chain_unregister(&cpu_chain, nb); } @@ -585,8 +574,8 @@ static inline void check_for_tasks(int dead_cpu) { struct task_struct *g, *p; - read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { if (!p->on_rq) continue; /* @@ -601,8 +590,8 @@ static inline void check_for_tasks(int dead_cpu) pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + } + read_unlock(&tasklist_lock); } struct take_cpu_down_param { @@ -611,7 +600,7 @@ struct take_cpu_down_param { }; /* Take this CPU down. */ -static int __ref take_cpu_down(void *_param) +static int take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; int err; @@ -625,12 +614,12 @@ static int __ref take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - kthread_park(current); + stop_machine_park((long)param->hcpu); return 0; } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { int mycpu, err, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -692,14 +681,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * will observe it. * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so explicitly call both. + * not imply sync_sched(), so wait for both. * * Do sync before park smpboot threads to take care the rcu boost case. */ -#ifdef CONFIG_PREEMPT - synchronize_sched(); -#endif - synchronize_rcu(); + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); __cpu_unplug_wait(cpu); smpboot_park_threads(cpu); @@ -708,14 +697,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_unplug_sync(cpu); /* - * So now all preempt/rcu users must observe !cpu_active(). + * Prevent irq alloc/free while the dying cpu reorganizes the + * interrupt affinities. */ + irq_lock_sparse(); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + irq_unlock_sparse(); goto out_release; } BUG_ON(cpu_online(cpu)); @@ -732,6 +726,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ per_cpu(cpu_dead_idle, cpu) = false; + /* Interrupts are moved away from the dying cpu, reenable alloc/free */ + irq_unlock_sparse(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -754,7 +751,7 @@ restore_cpus: return err; } -int __ref cpu_down(unsigned int cpu) +int cpu_down(unsigned int cpu) { int err; @@ -784,6 +781,7 @@ static int smpboot_thread_call(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: case CPU_ONLINE: smpboot_unpark_threads(cpu); break; @@ -800,7 +798,7 @@ static struct notifier_block smpboot_thread_notifier = { .priority = CPU_PRI_SMPBOOT, }; -void __cpuinit smpboot_thread_init(void) +void smpboot_thread_init(void) { register_cpu_notifier(&smpboot_thread_notifier); } @@ -840,6 +838,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -918,13 +917,18 @@ int disable_nonboot_cpus(void) } } - if (!error) { + if (!error) BUG_ON(num_online_cpus() > 1); - /* Make sure the CPUs won't be enabled by someone else */ - cpu_hotplug_disabled = 1; - } else { + else pr_err("Non-boot CPUs are not disabled\n"); - } + + /* + * Make sure the CPUs won't be enabled by someone else. We need to do + * this even in case of failure as all disable_nonboot_cpus() users are + * supposed to do enable_nonboot_cpus() on the failure path. + */ + cpu_hotplug_disabled++; + cpu_maps_update_done(); return error; } @@ -937,13 +941,13 @@ void __weak arch_enable_nonboot_cpus_end(void) { } -void __ref enable_nonboot_cpus(void) +void enable_nonboot_cpus(void) { int cpu, error; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); if (cpumask_empty(frozen_cpus)) goto out; diff --git a/kernel/kernel/cpu_pm.c b/kernel/kernel/cpu_pm.c index 9656a3c36..009cc9a17 100644 --- a/kernel/kernel/cpu_pm.c +++ b/kernel/kernel/cpu_pm.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * low power state that may have caused some blocks in the same power domain * to reset. * - * Must be called after cpu_pm_exit has been called on all cpus in the power + * Must be called after cpu_cluster_pm_enter has been called for the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller * and its PM extensions, local CPU timers context save/restore which diff --git a/kernel/kernel/cpuset.c b/kernel/kernel/cpuset.c index f0acff0f6..2ade63219 100644 --- a/kernel/kernel/cpuset.c +++ b/kernel/kernel/cpuset.c @@ -286,6 +286,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -473,7 +475,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) goto out; /* @@ -497,7 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -879,7 +882,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -896,7 +900,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -969,31 +973,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; + struct cpuset_migrate_mm_work *mwork; - tsk->mems_allowed = *to; - - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1094,7 +1118,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1135,7 +1160,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1152,7 +1178,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1426,25 +1452,26 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_on_dfl(css->cgroup) && + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1464,9 +1491,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1479,17 +1511,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; - struct mm_struct *mm; struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); - struct cpuset *cs = css_cs(css); + struct task_struct *leader; + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1500,7 +1534,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1512,26 +1546,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css, } /* - * Change mm, possibly for multiple threads in a threadgroup. This is - * expensive and may sleep. + * Change mm for all threadgroup leaders. This is expensive and may + * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - mm = get_task_mm(leader); - if (mm) { - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - - /* - * old_mems_allowed is the same with mems_allowed here, except - * if this task is being moved automatically due to hotplug. - * In that case @mems_allowed has been updated and is empty, - * so @old_mems_allowed is the right nodesets that we migrate - * mm from. - */ - if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, - &cpuset_attach_nodemask_to); + cgroup_taskset_for_each_leader(leader, css, tset) { + struct mm_struct *mm = get_task_mm(leader); + + if (mm) { + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); + + /* + * old_mems_allowed is the same with mems_allowed + * here, except if this task is being moved + * automatically due to hotplug. In that case + * @mems_allowed has been updated and is empty, so + * @old_mems_allowed is the right nodesets that we + * migrate mm from. + */ + if (is_memory_migrate(cs)) + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, + &cpuset_attach_nodemask_to); + else + mmput(mm); } - mmput(mm); } cs->old_mems_allowed = cpuset_attach_nodemask_to; @@ -1594,9 +1632,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_MEMORY_PRESSURE_ENABLED: cpuset_memory_pressure_enabled = !!val; break; - case FILE_MEMORY_PRESSURE: - retval = -EACCES; - break; case FILE_SPREAD_PAGE: retval = update_flag(CS_SPREAD_PAGE, cs, val); break; @@ -1698,6 +1733,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -1863,9 +1899,6 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE, - .mode = S_IRUGO, }, { @@ -1952,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(cs->css.cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2029,7 +2062,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(root_css->cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2210,7 +2243,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_on_dfl(cs->css.cgroup)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2241,7 +2274,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); + bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); mutex_lock(&cpuset_mutex); @@ -2346,6 +2379,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** @@ -2598,22 +2634,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, } /** - * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @tsk: pointer to task_struct of some task. + * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * - * Description: Prints @task's name, cpuset name, and cached copy of its + * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ -void cpuset_print_task_mems_allowed(struct task_struct *tsk) +void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); - cgrp = task_cs(tsk)->css.cgroup; - pr_info("%s cpuset=", tsk->comm); + cgrp = task_cs(current)->css.cgroup; + pr_info("%s cpuset=", current->comm); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); + pr_cont(" mems_allowed=%*pbl\n", + nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); } diff --git a/kernel/kernel/cred.c b/kernel/kernel/cred.c index ec1c07667..71179a09c 100644 --- a/kernel/kernel/cred.c +++ b/kernel/kernel/cred.c @@ -20,11 +20,16 @@ #include <linux/cn_proc.h> #if 0 -#define kdebug(FMT, ...) \ - printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk("[%-5.5s%5u] " FMT "\n", \ + current->comm, current->pid, ##__VA_ARGS__) #else -#define kdebug(FMT, ...) \ - no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#define kdebug(FMT, ...) \ +do { \ + if (0) \ + no_printk("[%-5.5s%5u] " FMT "\n", \ + current->comm, current->pid, ##__VA_ARGS__); \ +} while (0) #endif static struct kmem_cache *cred_jar; diff --git a/kernel/kernel/events/callchain.c b/kernel/kernel/events/callchain.c index d65948725..9c418002b 100644 --- a/kernel/kernel/events/callchain.c +++ b/kernel/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING diff --git a/kernel/kernel/events/core.c b/kernel/kernel/events/core.c index 5146610a8..760f41d98 100644 --- a/kernel/kernel/events/core.c +++ b/kernel/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -36,7 +36,7 @@ #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/module.h> @@ -51,9 +51,11 @@ static struct workqueue_struct *perf_wq; +typedef int (*remote_function_f)(void *); + struct remote_function_call { struct task_struct *p; - int (*func)(void *info); + remote_function_f func; void *info; int ret; }; @@ -86,7 +88,7 @@ static void remote_function(void *data) * -EAGAIN - when the process moved away */ static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, @@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) * * returns: @func return value or -ENXIO when the cpu is offline */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, @@ -161,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -193,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; -void update_perf_cpu_limits(void) +static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; @@ -432,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -455,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -469,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, * mode SWOUT : schedule out everything * mode SWIN : schedule in based on cgroup for next */ -void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; struct pmu *pmu; @@ -486,7 +489,6 @@ void perf_cgroup_switch(struct task_struct *task, int mode) * we reschedule only in the presence of cgroup * constrained events. */ - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -519,8 +521,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -528,8 +532,6 @@ void perf_cgroup_switch(struct task_struct *task, int mode) } } - rcu_read_unlock(); - local_irq_restore(flags); } @@ -539,17 +541,20 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -558,6 +563,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + + rcu_read_unlock(); } static inline void perf_cgroup_sched_in(struct task_struct *prev, @@ -566,13 +573,16 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing @@ -581,6 +591,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWIN); + + rcu_read_unlock(); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -747,62 +759,31 @@ perf_cgroup_mark_enabled(struct perf_event *event, /* * function must be called with interrupts disbled */ -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - enum hrtimer_restart ret = HRTIMER_NORESTART; int rotations = 0; WARN_ON(!irqs_disabled()); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); - /* - * arm timer if needed - */ - if (rotations) { + raw_spin_lock(&cpuctx->hrtimer_lock); + if (rotations) hrtimer_forward_now(hr, cpuctx->hrtimer_interval); - ret = HRTIMER_RESTART; - } - - return ret; -} - -/* CPU is going down */ -void perf_cpu_hrtimer_cancel(int cpu) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - if (WARN_ON(cpu != smp_processor_id())) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - if (pmu->task_ctx_nr == perf_sw_context) - continue; - - hrtimer_cancel(&cpuctx->hrtimer); - } - - rcu_read_unlock(); + else + cpuctx->hrtimer_active = 0; + raw_spin_unlock(&cpuctx->hrtimer_lock); - local_irq_restore(flags); + return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; - int timer; + u64 interval; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) @@ -812,31 +793,37 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) * check default is sane, if not set then force to * default interval (1/tick) */ - timer = pmu->hrtimer_interval_ms; - if (timer < 1) - timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + interval = pmu->hrtimer_interval_ms; + if (interval < 1) + interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - hr->function = perf_cpu_hrtimer_handler; + raw_spin_lock_init(&cpuctx->hrtimer_lock); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + timer->function = perf_mux_hrtimer_handler; + timer->irqsafe = 1; } -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; + unsigned long flags; /* not for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) - return; + return 0; - if (hrtimer_active(hr)) - return; + raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); + if (!cpuctx->hrtimer_active) { + cpuctx->hrtimer_active = 1; + hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } + raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); - if (!hrtimer_callback_running(hr)) - __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, - 0, HRTIMER_MODE_REL_PINNED, 0); + return 0; } void perf_pmu_disable(struct pmu *pmu) @@ -1073,13 +1060,13 @@ retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when - * part of the read side critical section was preemptible -- see + * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read - * side critical section is non-preemptible. + * side critical section has interrupts disabled. */ - preempt_disable(); + local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { @@ -1093,21 +1080,22 @@ retry: * if so. If we locked the right context, then it * can't get swapped on us any more. */ - raw_spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); rcu_read_unlock(); - preempt_enable(); + local_irq_restore(*flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); ctx = NULL; } } rcu_read_unlock(); - preempt_enable(); + if (!ctx) + local_irq_restore(*flags); return ctx; } @@ -1266,11 +1254,7 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) +static void __perf_event_read_size(struct perf_event *event, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; @@ -1286,7 +1270,7 @@ static void perf_event__read_size(struct perf_event *event) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; + nr += nr_siblings; size += sizeof(u64); } @@ -1294,14 +1278,11 @@ static void perf_event__read_size(struct perf_event *event) event->read_size = size; } -static void perf_event__header_size(struct perf_event *event) +static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; u16 size = 0; - perf_event__read_size(event); - if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); @@ -1326,6 +1307,17 @@ static void perf_event__header_size(struct perf_event *event) event->header_size = size; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__header_size(struct perf_event *event) +{ + __perf_event_read_size(event, + event->group_leader->nr_siblings); + __perf_event_header_size(event, event->attr.sample_type); +} + static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; @@ -1353,6 +1345,27 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +static bool perf_event_validate_size(struct perf_event *event) +{ + /* + * The values computed here will be over-written when we actually + * attach the event. + */ + __perf_event_read_size(event, event->group_leader->nr_siblings + 1); + __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); + perf_event__id_header_size(event); + + /* + * Sum the lot; should not exceed the 64k limit we have on records. + * Conservative limit to allow for callchains and other variable fields. + */ + if (event->read_size + event->header_size + + event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) + return false; + + return true; +} + static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; @@ -1526,11 +1539,17 @@ static int __init perf_workqueue_init(void) core_initcall(perf_workqueue_init); +static inline int pmu_filter_match(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + return pmu->filter_match ? pmu->filter_match(event) : 1; +} + static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event); + && perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1931,11 +1950,11 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - pmu->start_txn(pmu); + pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1982,7 +2001,7 @@ group_error: pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -2255,7 +2274,7 @@ static int __perf_event_enable(void *info) */ if (leader != event) { group_sched_out(leader, cpuctx, ctx); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); } if (leader->attr.pinned) { update_group_times(leader); @@ -2637,6 +2656,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2659,6 +2681,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2849,6 +2874,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3127,15 +3155,16 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(struct perf_event_context *ctx) +static void perf_event_enable_on_exec(int ctxn) { - struct perf_event_context *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *event; unsigned long flags; int enabled = 0; int ret; local_irq_save(flags); + ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; @@ -3178,28 +3207,30 @@ out: void perf_event_exec(void) { - struct perf_event_context *ctx; int ctxn; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } + for_each_task_context_nr(ctxn) + perf_event_enable_on_exec(ctxn); rcu_read_unlock(); } +struct perf_read_data { + struct perf_event *event; + bool group; + int ret; +}; + /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { - struct perf_event *event = info; + struct perf_read_data *data = info; + struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is @@ -3216,9 +3247,35 @@ static void __perf_event_read(void *info) update_context_time(ctx); update_cgrp_time_from_event(event); } + update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!data->group) { + pmu->read(event); + data->ret = 0; + goto unlock; + } + + pmu->start_txn(pmu, PERF_PMU_TXN_READ); + + pmu->read(event); + + list_for_each_entry(sub, &event->sibling_list, group_entry) { + update_event_times(sub); + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + /* + * Use sibling's PMU rather than @event's since + * sibling could be on different (eg: software) PMU. + */ + sub->pmu->read(sub); + } + } + + data->ret = pmu->commit_txn(pmu); + +unlock: raw_spin_unlock(&ctx->lock); } @@ -3230,15 +3287,76 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } -static u64 perf_event_read(struct perf_event *event) +/* + * NMI-safe method to read a local event, that is an event that + * is: + * - either for the current task, or for this CPU + * - does not have inherit set, for inherited task events + * will not be local and we cannot read them atomically + * - must not have a pmu::count method + */ +u64 perf_event_read_local(struct perf_event *event) { + unsigned long flags; + u64 val; + + /* + * Disabling interrupts avoids all counter scheduling (context + * switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + + /* If this is a per-task event, it must be for current */ + WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current); + + /* If this is a per-CPU event, it must be for this CPU */ + WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()); + + /* + * It must not be an event with inherit set, we cannot read + * all child counters from atomic context. + */ + WARN_ON_ONCE(event->attr.inherit); + + /* + * It must not have a pmu::count method, those are not + * NMI safe. + */ + WARN_ON_ONCE(event->pmu->count); + + /* + * If the event is currently on this CPU, its either a per-task event, + * or local to this CPU. Furthermore it means its ACTIVE (otherwise + * oncpu == -1). + */ + if (event->oncpu == smp_processor_id()) + event->pmu->read(event); + + val = local64_read(&event->count); + local_irq_restore(flags); + + return val; +} + +static int perf_event_read(struct perf_event *event, bool group) +{ + int ret = 0; + /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data = { + .event = event, + .group = group, + .ret = 0, + }; smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); + __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3253,11 +3371,14 @@ static u64 perf_event_read(struct perf_event *event) update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (group) + update_group_times(event); + else + update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return perf_event_count(event); + return ret; } /* @@ -3314,7 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; @@ -3472,6 +3593,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -3695,7 +3820,7 @@ static void put_event(struct perf_event *event) * see the comment there. * * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while + * perf_read_group(), which takes faults while * holding ctx->mutex, however this is called after * the last filedesc died, so there is no possibility * to trigger the AB-BA case. @@ -3769,14 +3894,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) *running = 0; mutex_lock(&event->child_mutex); - total += perf_event_read(event); + + (void)perf_event_read(event, false); + total += perf_event_count(event); + *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); + (void)perf_event_read(child, false); + total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -3786,55 +3915,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_event_read_group(struct perf_event *event, - u64 read_format, char __user *buf) +static int __perf_read_group_add(struct perf_event *leader, + u64 read_format, u64 *values) { - struct perf_event *leader = event->group_leader, *sub; - struct perf_event_context *ctx = leader->ctx; - int n = 0, size = 0, ret; - u64 count, enabled, running; - u64 values[5]; + struct perf_event *sub; + int n = 1; /* skip @nr */ + int ret; - lockdep_assert_held(&ctx->mutex); + ret = perf_event_read(leader, true); + if (ret) + return ret; - count = perf_event_read_value(leader, &enabled, &running); + /* + * Since we co-schedule groups, {enabled,running} times of siblings + * will be identical to those of the leader, so we only publish one + * set. + */ + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] += leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] += leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + /* + * Write {count,id} tuples for every sibling. + */ + values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - size = n * sizeof(u64); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + values[n++] += perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + } - if (copy_to_user(buf, values, size)) - return -EFAULT; + return 0; +} - ret = size; +static int perf_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *child; + struct perf_event_context *ctx = leader->ctx; + int ret; + u64 *values; - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; + lockdep_assert_held(&ctx->mutex); - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); + values = kzalloc(event->read_size, GFP_KERNEL); + if (!values) + return -ENOMEM; - size = n * sizeof(u64); + values[0] = 1 + leader->nr_siblings; - if (copy_to_user(buf + ret, values, size)) { - return -EFAULT; - } + /* + * By locking the child_mutex of the leader we effectively + * lock the child list of all siblings.. XXX explain how. + */ + mutex_lock(&leader->child_mutex); + + ret = __perf_read_group_add(leader, read_format, values); + if (ret) + goto unlock; - ret += size; + list_for_each_entry(child, &leader->child_list, child_list) { + ret = __perf_read_group_add(child, read_format, values); + if (ret) + goto unlock; } + mutex_unlock(&leader->child_mutex); + + ret = event->read_size; + if (copy_to_user(buf, values, event->read_size)) + ret = -EFAULT; + goto out; + +unlock: + mutex_unlock(&leader->child_mutex); +out: + kfree(values); return ret; } -static int perf_event_read_one(struct perf_event *event, +static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; @@ -3872,7 +4041,7 @@ static bool is_event_hup(struct perf_event *event) * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +__perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; @@ -3890,9 +4059,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, buf); + ret = perf_read_group(event, read_format, buf); else - ret = perf_event_read_one(event, read_format, buf); + ret = perf_read_one(event, read_format, buf); return ret; } @@ -3905,7 +4074,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ctx = perf_event_ctx_lock(event); - ret = perf_read_hw(event, buf, count); + ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; @@ -3936,7 +4105,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - (void)perf_event_read(event); + (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -4052,7 +4221,14 @@ retry: goto retry; } - __perf_event_period(&pe); + if (event->attr.freq) { + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } + + local64_set(&event->hw.period_left, 0); raw_spin_unlock_irq(&ctx->lock); return 0; @@ -4411,14 +4587,6 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -static void rb_free_rcu(struct rcu_head *rcu_head) -{ - struct ring_buffer *rb; - - rb = container_of(rcu_head, struct ring_buffer, rcu_head); - rb_free(rb); -} - struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -5220,9 +5388,15 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_RAW) { if (data->raw) { - perf_output_put(handle, data->raw->size); - __output_copy(handle, data->raw->data, - data->raw->size); + u32 raw_size = data->raw->size; + u32 real_size = round_up(raw_size + sizeof(u32), + sizeof(u64)) - sizeof(u32); + u64 zero = 0; + + perf_output_put(handle, real_size); + __output_copy(handle, data->raw->data, raw_size); + if (real_size - raw_size) + __output_copy(handle, &zero, real_size - raw_size); } else { struct { u32 size; @@ -5354,8 +5528,7 @@ void perf_prepare_sample(struct perf_event_header *header, else size += sizeof(u32); - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; + header->size += round_up(size, sizeof(u64)); } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -5424,9 +5597,9 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +void perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_output_handle handle; struct perf_event_header header; @@ -5506,6 +5679,17 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } static void +perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, + struct perf_event_context *task_ctx) +{ + rcu_read_lock(); + preempt_disable(); + perf_event_aux_ctx(task_ctx, output, data); + preempt_enable(); + rcu_read_unlock(); +} + +static void perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { @@ -5514,14 +5698,23 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, struct pmu *pmu; int ctxn; + /* + * If we have task_ctx != NULL we only notify + * the task context itself. The task_ctx is set + * only for EXIT events before releasing task + * context. + */ + if (task_ctx) { + perf_event_aux_task_ctx(output, data, task_ctx); + return; + } + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; perf_event_aux_ctx(&cpuctx->ctx, output, data); - if (task_ctx) - goto next; ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; @@ -5531,12 +5724,6 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, next: put_cpu_ptr(pmu->pmu_cpu_context); } - - if (task_ctx) { - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); - preempt_enable(); - } rcu_read_unlock(); } @@ -5855,7 +6042,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); + name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; @@ -6018,6 +6205,124 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, } /* + * Lost/dropped samples logging + */ +void perf_log_lost_samples(struct perf_event *event, u64 lost) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + struct { + struct perf_event_header header; + u64 lost; + } lost_samples_event = { + .header = { + .type = PERF_RECORD_LOST_SAMPLES, + .misc = 0, + .size = sizeof(lost_samples_event), + }, + .lost = lost, + }; + + perf_event_header__init_id(&lost_samples_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + lost_samples_event.header.size); + if (ret) + return; + + perf_output_put(&handle, lost_samples_event); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +} + +/* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + +/* * IRQ throttle logging */ @@ -6076,8 +6381,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); @@ -6186,9 +6489,6 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; - - /* Keeps track of cpu being initialized/exited */ - bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -6446,14 +6746,8 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (!head) { - /* - * We can race with cpu hotplug code. Do not - * WARN if the cpu just got unplugged. - */ - WARN_ON_ONCE(swhash->online); + if (WARN_ON_ONCE(!head)) return -EINVAL; - } hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); @@ -6521,7 +6815,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; @@ -6637,6 +6930,10 @@ static int perf_tp_filter_match(struct perf_event *event, { void *record = data->raw->data; + /* only top level events have filters set */ + if (event->parent) + event = event->parent; + if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; @@ -6785,8 +7082,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); @@ -6907,9 +7204,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) } else { period = max_t(u64, 10000, hwc->sample_period); } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), + HRTIMER_MODE_REL_PINNED); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -7112,24 +7408,49 @@ static void perf_pmu_nop_void(struct pmu *pmu) { } +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) +{ +} + static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } -static void perf_pmu_start_txn(struct pmu *pmu) +static DEFINE_PER_CPU(unsigned int, nop_txn_flags); + +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { + __this_cpu_write(nop_txn_flags, flags); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return 0; + perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_enable(pmu); } @@ -7211,6 +7532,8 @@ perf_event_mux_interval_ms_show(struct device *dev, return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } +static DEFINE_MUTEX(mux_interval_mutex); + static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, @@ -7230,17 +7553,21 @@ perf_event_mux_interval_ms_store(struct device *dev, if (timer == pmu->hrtimer_interval_ms) return count; + mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - if (hrtimer_active(&cpuctx->hrtimer)) - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + cpu_function_call(cpu, + (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } + put_online_cpus(); + mutex_unlock(&mux_interval_mutex); return count; } @@ -7345,7 +7672,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; - __perf_cpu_hrtimer_init(cpuctx, cpu); + __perf_mux_hrtimer_init(cpuctx, cpu); cpuctx->unique_pmu = pmu; } @@ -7362,7 +7689,7 @@ got_cpu_context: pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { - pmu->start_txn = perf_pmu_nop_void; + pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } @@ -7450,7 +7777,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) return ret; } -struct pmu *perf_init_event(struct perf_event *event) +static struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; @@ -7511,6 +7838,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) @@ -8132,13 +8463,35 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; + mutex_lock_double(&gctx->mutex, &ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_locked; + } + + /* + * Must be under the same ctx::mutex as perf_install_in_context(), + * because we need to serialize with concurrent event creation. + */ + if (!exclusive_event_installable(event, ctx)) { + /* exclusive and group stuff are assumed mutually exclusive */ + WARN_ON_ONCE(move_group); + + err = -EBUSY; + goto err_locked; + } + + WARN_ON_ONCE(ctx->parent_ctx); + + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - mutex_lock_double(&gctx->mutex, &ctx->mutex); - perf_remove_from_context(group_leader, false); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -8146,13 +8499,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(sibling, false); put_ctx(gctx); } - } else { - mutex_lock(&ctx->mutex); - } - WARN_ON_ONCE(ctx->parent_ctx); - - if (move_group) { /* * Wait for everybody to stop referencing the events through * the old lists, before installing it on new lists. @@ -8184,22 +8531,29 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - } - if (!exclusive_event_installable(event, ctx)) { - err = -EBUSY; - mutex_unlock(&ctx->mutex); - fput(event_file); - goto err_context; + /* + * Now that all events are installed in @ctx, nothing + * references @gctx anymore, so drop the last reference we have + * on it. + */ + put_ctx(gctx); } + /* + * Precalculate sample_data sizes; do while holding ctx::mutex such + * that we're serialized against further additions and before + * perf_install_in_context() which is the point the event is active and + * can use these values. + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) { + if (move_group) mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -8211,12 +8565,6 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in @@ -8226,6 +8574,12 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; +err_locked: + if (move_group) + mutex_unlock(&gctx->mutex); + mutex_unlock(&ctx->mutex); +/* err_file: */ + fput(event_file); err_context: perf_unpin_context(ctx); put_ctx(ctx); @@ -8450,10 +8804,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) struct perf_event_context *child_ctx, *clone_ctx = NULL; unsigned long flags; - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); + if (likely(!child->perf_event_ctxp[ctxn])) return; - } local_irq_save(flags); /* @@ -8537,6 +8889,14 @@ void perf_event_exit_task(struct task_struct *child) for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); + + /* + * The perf_event_exit_task_context calls perf_event_task + * with child's task_ctx, which generates EXIT events for + * child contexts and sets child->perf_event_ctxp[] to NULL. + * At this point we need to send EXIT events to cpu contexts. + */ + perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, @@ -8606,6 +8966,31 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } +struct perf_event *perf_event_get(unsigned int fd) +{ + int err; + struct fd f; + struct perf_event *event; + + err = perf_fget_light(fd, &f); + if (err) + return ERR_PTR(err); + + event = f.file->private_data; + atomic_long_inc(&event->refcount); + fdput(f); + + return event; +} + +const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + if (!event) + return ERR_PTR(-EINVAL); + + return &event->attr; +} + /* * inherit a event from parent task to child task: */ @@ -8893,7 +9278,6 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -8904,7 +9288,7 @@ static void perf_event_init_cpu(int cpu) mutex_unlock(&swhash->hlist_mutex); } -#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; @@ -8935,14 +9319,7 @@ static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu(int cpu) { - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - perf_event_exit_cpu_context(cpu); - - mutex_lock(&swhash->hlist_mutex); - swhash->online = false; - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -9090,38 +9467,24 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; + rcu_read_lock(); perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + rcu_read_unlock(); return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - task_function_call(task, __perf_cgroup_move, task); -} - struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, - .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/kernel/events/internal.h b/kernel/kernel/events/internal.h index 9f6ce9ba4..2bbad9c12 100644 --- a/kernel/kernel/events/internal.h +++ b/kernel/kernel/events/internal.h @@ -11,6 +11,7 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; + struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ @@ -55,6 +56,15 @@ struct ring_buffer { }; extern void rb_free(struct ring_buffer *rb); + +static inline void rb_free_rcu(struct rcu_head *rcu_head) +{ + struct ring_buffer *rb; + + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); @@ -72,15 +82,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb) void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); -extern void -perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event); -extern void -perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample); - extern struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); diff --git a/kernel/kernel/events/ring_buffer.c b/kernel/kernel/events/ring_buffer.c index a7604c811..adfdc0536 100644 --- a/kernel/kernel/events/ring_buffer.c +++ b/kernel/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle, perf_output_get_handle(handle); do { - tail = ACCESS_ONCE(rb->user_page->data_tail); + tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) @@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static void rb_irq_work(struct irq_work *work); + static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); + init_irq_work(&rb->irq_work, rb_irq_work); +} + +static void ring_buffer_put_async(struct ring_buffer *rb) +{ + if (!atomic_dec_and_test(&rb->refcount)) + return; + + rb->rcu_head.next = (void *)rb; + irq_work_queue(&rb->irq_work); } /* @@ -319,7 +331,7 @@ err_put: rb_free_aux(rb); err: - ring_buffer_put(rb); + ring_buffer_put_async(rb); handle->event = NULL; return NULL; @@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, local_set(&rb->aux_nest, 0); rb_free_aux(rb); - ring_buffer_put(rb); + ring_buffer_put_async(rb); } /* @@ -425,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); @@ -559,7 +574,18 @@ static void __rb_free_aux(struct ring_buffer *rb) void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) + irq_work_queue(&rb->irq_work); +} + +static void rb_irq_work(struct irq_work *work) +{ + struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); + + if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); + + if (rb->rcu_head.next == (void *)rb) + call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/kernel/events/uprobes.c b/kernel/kernel/events/uprobes.c index cb346f26a..7dad84913 100644 --- a/kernel/kernel/events/uprobes.c +++ b/kernel/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -105,17 +96,18 @@ struct return_instance { * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; @@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); @@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. @@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } @@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static struct return_instance *find_next_ret_chain(struct return_instance *ri) { - struct uprobe_task *utask; - struct return_instance *ri, *tmp; bool chained; + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + +static void handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *next; + bool valid; + utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; - - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { - handle_uretprobe_chain(ri, regs); - - chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); - utask->depth--; + goto sigill; - if (!chained) - break; - BUG_ON(!ri); - } + do { + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { diff --git a/kernel/kernel/exit.c b/kernel/kernel/exit.c index a0cf72b37..73497e458 100644 --- a/kernel/kernel/exit.c +++ b/kernel/kernel/exit.c @@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - unmark_oom_victim(); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -706,15 +706,17 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) + if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } - acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -761,7 +763,9 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(preempt_disable()); TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); + TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -1471,7 +1475,7 @@ static long do_wait(struct wait_opts *wo) add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* - * If there is nothing that can match our critiera just get out. + * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. diff --git a/kernel/kernel/extable.c b/kernel/kernel/extable.c index c98f92627..e820ccee9 100644 --- a/kernel/kernel/extable.c +++ b/kernel/kernel/extable.c @@ -18,7 +18,6 @@ #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/module.h> -#include <linux/ftrace.h> #include <linux/mutex.h> #include <linux/init.h> diff --git a/kernel/kernel/fork.c b/kernel/kernel/fork.c index 1b0e656f6..46c1e8342 100644 --- a/kernel/kernel/fork.c +++ b/kernel/kernel/fork.c @@ -253,6 +253,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + cgroup_free(tsk); task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); @@ -300,6 +301,11 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +/* Initialized by the architecture: */ +int arch_task_struct_size __read_mostly; +#endif + void __init fork_init(void) { #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR @@ -308,7 +314,7 @@ void __init fork_init(void) #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), + kmem_cache_create("task_struct", arch_task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif @@ -387,6 +393,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; + tsk->wake_q.next = NULL; account_kernel_stack(ti, 1); @@ -462,8 +469,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= + ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; + tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -1093,6 +1102,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; + atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; @@ -1117,13 +1127,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) { unsigned long cpu_limit; - /* Thread group counters. */ - thread_group_cputime_init(sig); - - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); - sig->cputimer.running = 1; + sig->cputimer.running = true; } /* The timer lists. */ @@ -1157,6 +1164,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); + prev_cputime_init(&sig->prev_cputime); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; @@ -1170,10 +1178,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); -#ifdef CONFIG_CGROUPS - init_rwsem(&sig->group_rwsem); -#endif - sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1270,10 +1274,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, - int trace) + int trace, + unsigned long tls) { int retval; struct task_struct *p; + void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1308,10 +1314,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* * If the new process will be in a different pid or user namespace - * do not allow it to share a thread group or signal handlers or - * parent with the forking task. + * do not allow it to share a thread group with the forking task. */ - if (clone_flags & CLONE_SIGHAND) { + if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) @@ -1371,9 +1376,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - p->prev_cputime.utime = p->prev_cputime.stime = 0; -#endif + prev_cputime_init(&p->prev_cputime); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN raw_spin_lock_init(&p->vtime_lock); seqcount_init(&p->vtime_seq); @@ -1396,8 +1400,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - if (clone_flags & CLONE_THREAD) - threadgroup_change_begin(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1481,7 +1484,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p); + retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); if (retval) goto bad_fork_cleanup_io; @@ -1550,6 +1553,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->task_works = NULL; /* + * Ensure that the cgroup subsystem policies allow the new process to be + * forked. It should be noted the the new process's css_set can be changed + * between here and cgroup_post_fork() if an organisation operation is in + * progress. + */ + retval = cgroup_can_fork(p, cgrp_ss_priv); + if (retval) + goto bad_fork_free_pid; + + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ @@ -1585,7 +1598,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_cancel_cgroup; } if (likely(p->pid)) { @@ -1627,9 +1640,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + cgroup_post_fork(p, cgrp_ss_priv); + threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1637,6 +1649,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; +bad_fork_cancel_cgroup: + cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1668,8 +1682,7 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); @@ -1693,7 +1706,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1708,11 +1721,12 @@ struct task_struct *fork_idle(int cpu) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long do_fork(unsigned long clone_flags, +long _do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + unsigned long tls) { struct task_struct *p; int trace = 0; @@ -1737,7 +1751,7 @@ long do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace); + child_tidptr, NULL, trace, tls); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1778,20 +1792,34 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +/* For compatibility with architectures that call do_fork directly rather than + * using the syscall entry points below. */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return _do_fork(clone_flags, stack_start, stack_size, + parent_tidptr, child_tidptr, 0); +} +#endif + /* * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL); + return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, + (unsigned long)arg, NULL, NULL, 0); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, 0, NULL, NULL); + return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); #else /* can not support in nommu mode */ return -EINVAL; @@ -1802,8 +1830,8 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL); + return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + 0, NULL, NULL, 0); } #endif @@ -1811,27 +1839,27 @@ SYSCALL_DEFINE0(vfork) #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, - int, tls_val, + unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #endif { - return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); } #endif @@ -1966,7 +1994,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) int err; /* - * If unsharing a user namespace must also unshare the thread. + * If unsharing a user namespace must also unshare the thread group + * and unshare the filesystem root and working directories. */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; diff --git a/kernel/kernel/futex.c b/kernel/kernel/futex.c index bf20e0ad0..02bb86921 100644 --- a/kernel/kernel/futex.c +++ b/kernel/kernel/futex.c @@ -64,6 +64,7 @@ #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/bootmem.h> +#include <linux/fault-inject.h> #include <asm/futex.h> @@ -254,9 +255,78 @@ struct futex_hash_bucket { struct plist_head chain; } ____cacheline_aligned_in_smp; -static unsigned long __read_mostly futex_hashsize; +/* + * The base of the bucket array and its size are always used together + * (after initialization only in hash_futex()), so ensure that they + * reside in the same cacheline. + */ +static struct { + struct futex_hash_bucket *queues; + unsigned long hashsize; +} __futex_data __read_mostly __aligned(2*sizeof(long)); +#define futex_queues (__futex_data.queues) +#define futex_hashsize (__futex_data.hashsize) + + +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX -static struct futex_hash_bucket *futex_queues; +static struct { + struct fault_attr attr; + + bool ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = false, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +static bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if (!debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif /* CONFIG_FAIL_FUTEX */ static inline void futex_get_mm(union futex_key *key) { @@ -413,6 +483,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -428,6 +501,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + err = get_user_pages_fast(address, 1, 1, &page); /* * If write access is not required (eg. FUTEX_WAIT), try @@ -516,7 +593,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (ro) { + if (unlikely(should_fail_futex(fshared)) || ro) { err = -EFAULT; goto out; } @@ -976,6 +1053,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { u32 uninitialized_var(curval); + if (unlikely(should_fail_futex(true))) + return -EFAULT; + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; @@ -1017,12 +1097,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Detect deadlocks. */ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + /* * Lookup existing state first. If it exists, try to attach to * its pi_state. @@ -1125,6 +1211,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + WAKE_Q(wake_q); + WAKE_Q(wake_sleeper_q); bool deboost; int ret = 0; @@ -1138,7 +1226,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1156,36 +1244,43 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + if (unlikely(should_fail_futex(true))) + ret = -EFAULT; + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex); + deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, + &wake_sleeper_q); /* - * We deboost after dropping hb->lock. That prevents a double - * wakeup on RT. + * First unlock HB so the waiter does not spin on it once he got woken + * up. Second wake up the waiter before the priority is adjusted. If we + * deboost first (and lose our higher priority), then the task might get + * scheduled away before the wake up can take place. */ spin_unlock(&hb->lock); - + wake_up_q(&wake_q); + wake_up_q_sleeper(&wake_sleeper_q); if (deboost) rt_mutex_adjust_prio(current); @@ -1456,6 +1551,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Find the top_waiter and determine if there are additional waiters. * If the caller intends to requeue more than 1 waiter to pifutex, @@ -2046,11 +2144,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -2080,7 +2178,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, { /* * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using set_mb() and + * wake it. set_current_state() is implemented using smp_store_mb() and * queue_me() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ @@ -2088,11 +2186,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, queue_me(q, hb); /* Arm the timer */ - if (timeout) { + if (timeout) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } /* * If we have been removed from the hash list, then another task @@ -2280,8 +2375,11 @@ static long futex_wait_restart(struct restart_block *restart) /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. */ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) @@ -2312,6 +2410,10 @@ retry_private: ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ switch (ret) { case 1: /* We got the lock. */ @@ -2438,21 +2540,18 @@ retry: match = futex_top_waiter(hb, &key); if (match) { ret = wake_futex_pi(uaddr, uval, match, hb); - /* * In case of success wake_futex_pi dropped the hash * bucket lock. */ if (!ret) goto out_putkey; - /* * The atomic access to the futex value generated a * pagefault, so retry the user-access and the wakeup: */ if (ret == -EFAULT) goto pi_faulted; - /* * wake_futex_pi has detected invalid state. Tell user * space. @@ -2545,7 +2644,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. + * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all @@ -2704,6 +2803,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, spin_lock(&hb2->lock); BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + free_pi_state(q.pi_state); spin_unlock(&hb2->lock); } } else { @@ -2831,7 +2935,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; @@ -3054,6 +3158,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) diff --git a/kernel/kernel/futex_compat.c b/kernel/kernel/futex_compat.c index 55c8c9349..4ae3232e7 100644 --- a/kernel/kernel/futex_compat.c +++ b/kernel/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kernel/gcov/base.c b/kernel/kernel/gcov/base.c index a744098e4..7080ae1eb 100644 --- a/kernel/kernel/gcov/base.c +++ b/kernel/kernel/gcov/base.c @@ -92,6 +92,12 @@ void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_time_profile); +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/kernel/gcov/gcc_4_7.c b/kernel/kernel/gcov/gcc_4_7.c index 826ba9fb5..e25e92fb4 100644 --- a/kernel/kernel/gcov/gcc_4_7.c +++ b/kernel/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#define GCOV_COUNTERS 10 +#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 #else #define GCOV_COUNTERS 8 diff --git a/kernel/kernel/irq/Kconfig b/kernel/kernel/irq/Kconfig index 9a76e3bed..3b48dab80 100644 --- a/kernel/kernel/irq/Kconfig +++ b/kernel/kernel/irq/Kconfig @@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ config GENERIC_PENDING_IRQ bool +# Support for generic irq migrating off cpu before the cpu is offline. +config GENERIC_IRQ_MIGRATION + bool + # Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY bool diff --git a/kernel/kernel/irq/Makefile b/kernel/kernel/irq/Makefile index d12123526..2fc9cbdf3 100644 --- a/kernel/kernel/irq/Makefile +++ b/kernel/kernel/irq/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o +obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/kernel/irq/chip.c b/kernel/kernel/irq/chip.c index 94bbd8fee..15206453b 100644 --- a/kernel/kernel/irq/chip.c +++ b/kernel/kernel/irq/chip.c @@ -21,6 +21,20 @@ #include "internals.h" +static irqreturn_t bad_chained_irq(int irq, void *dev_id) +{ + WARN_ONCE(1, "Chained irq %d should not call an action\n", irq); + return IRQ_NONE; +} + +/* + * Chained handlers should never call action on their IRQ. This default + * action will emit warning if such thing happens. + */ +struct irqaction chained_action = { + .handler = bad_chained_irq, +}; + /** * irq_set_chip - set the irq chip for an irq * @irq: irq number @@ -63,7 +77,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; } @@ -83,7 +97,7 @@ int irq_set_handler_data(unsigned int irq, void *data) if (!desc) return -EINVAL; - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } @@ -105,7 +119,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, if (!desc) return -EINVAL; - desc->irq_data.msi_desc = entry; + desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); @@ -187,7 +201,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_enable(desc); } if (resend) - check_irq_resend(desc, desc->irq_data.irq); + check_irq_resend(desc); return ret; } @@ -227,6 +241,13 @@ void irq_enable(struct irq_desc *desc) * disabled. If an interrupt happens, then the interrupt flow * handler masks the line at the hardware level and marks it * pending. + * + * If the interrupt chip does not implement the irq_disable callback, + * a driver can disable the lazy approach for a particular irq line by + * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can + * be used for devices which cannot disable the interrupt at the + * device level under certain circumstances and have to use + * disable_irq[_nosync] instead. */ void irq_disable(struct irq_desc *desc) { @@ -234,6 +255,8 @@ void irq_disable(struct irq_desc *desc) if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); irq_state_set_masked(desc); + } else if (irq_settings_disable_unlazy(desc)) { + mask_irq(desc); } } @@ -315,7 +338,7 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -328,7 +351,7 @@ void handle_nested_irq(unsigned int irq) action_ret = action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + note_interrupt(desc, action_ret); raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); @@ -372,7 +395,6 @@ static bool irq_may_run(struct irq_desc *desc) /** * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt @@ -382,8 +404,7 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) +void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -391,7 +412,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; @@ -425,7 +446,6 @@ static void cond_unmask_irq(struct irq_desc *desc) /** * handle_level_irq - Level type irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has @@ -433,8 +453,7 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) +void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -443,7 +462,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -496,7 +515,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) /** * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() @@ -504,8 +522,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; @@ -515,7 +532,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -546,7 +563,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware @@ -560,8 +576,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -583,7 +598,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; } - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); @@ -618,13 +633,12 @@ EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -646,7 +660,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) goto out_eoi; } - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); do { if (unlikely(!desc->action)) @@ -665,22 +679,20 @@ out_eoi: /** * handle_percpu_irq - Per CPU local irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - handle_irq_event_percpu(desc, desc->action); + handle_irq_event_percpu(desc); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); @@ -688,7 +700,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. Same as @@ -698,14 +709,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; void *dev_id = raw_cpu_ptr(action->percpu_dev_id); + unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -719,15 +731,9 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) } void -__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) +__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, + int is_chained, const char *name) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - if (!handle) { handle = handle_bad_irq; } else { @@ -749,13 +755,13 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, * right away. */ if (WARN_ON(is_chained)) - goto out; + return; /* Try the parent */ irq_data = irq_data->parent_data; } #endif if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) - goto out; + return; } /* Uninstall? */ @@ -763,6 +769,8 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); + if (is_chained) + desc->action = NULL; desc->depth = 1; } desc->handle_irq = handle; @@ -772,14 +780,44 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); + desc->action = &chained_action; irq_startup(desc, true); } -out: +} + +void +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); + + if (!desc) + return; + + __irq_do_set_handler(desc, handle, is_chained, name); irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__irq_set_handler); void +irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, + void *data) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); + + if (!desc) + return; + + __irq_do_set_handler(desc, handle, 1, NULL); + desc->irq_common_data.handler_data = data; + + irq_put_desc_busunlock(desc, flags); +} +EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); + +void irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { @@ -876,6 +914,34 @@ void irq_cpu_offline(void) #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** + * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if + * NULL) + * @data: Pointer to interrupt specific data + */ +void irq_chip_enable_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_enable) + data->chip->irq_enable(data); + else + data->chip->irq_unmask(data); +} + +/** + * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if + * NULL) + * @data: Pointer to interrupt specific data + */ +void irq_chip_disable_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_disable) + data->chip->irq_disable(data); + else + data->chip->irq_mask(data); +} + +/** * irq_chip_ack_parent - Acknowledge the parent interrupt * @data: Pointer to interrupt specific data */ @@ -967,6 +1033,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) } /** + * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @vcpu_info: The vcpu affinity information + */ +int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) +{ + data = data->parent_data; + if (data->chip->irq_set_vcpu_affinity) + return data->chip->irq_set_vcpu_affinity(data, vcpu_info); + + return -ENOSYS; +} + +/** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data * @on: Whether to set or reset the wake-up capability of this irq diff --git a/kernel/kernel/irq/cpuhotplug.c b/kernel/kernel/irq/cpuhotplug.c new file mode 100644 index 000000000..011f8c4c6 --- /dev/null +++ b/kernel/kernel/irq/cpuhotplug.c @@ -0,0 +1,82 @@ +/* + * Generic cpu hotunplug interrupt migration code copied from the + * arch/arm implementation + * + * Copyright (C) Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/interrupt.h> +#include <linux/ratelimit.h> +#include <linux/irq.h> + +#include "internals.h" + +static bool migrate_one_irq(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + const struct cpumask *affinity = d->common->affinity; + struct irq_chip *c; + bool ret = false; + + /* + * If this is a per-CPU interrupt, or the affinity does not + * include this CPU, then we have nothing to do. + */ + if (irqd_is_per_cpu(d) || + !cpumask_test_cpu(smp_processor_id(), affinity)) + return false; + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + affinity = cpu_online_mask; + ret = true; + } + + c = irq_data_get_irq_chip(d); + if (!c->irq_set_affinity) { + pr_debug("IRQ%u: unable to set affinity\n", d->irq); + } else { + int r = irq_do_set_affinity(d, affinity, false); + if (r) + pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", + d->irq, r); + } + + return ret; +} + +/** + * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu + * + * The current CPU has been marked offline. Migrate IRQs off this CPU. + * If the affinity settings do not allow other CPUs, force them onto any + * available CPU. + * + * Note: we must iterate over all IRQs, whether they have an attached + * action structure or not, as we need to get chained interrupts too. + */ +void irq_migrate_all_off_this_cpu(void) +{ + unsigned int irq; + struct irq_desc *desc; + unsigned long flags; + + local_irq_save(flags); + + for_each_active_irq(irq) { + bool affinity_broken; + + desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); + affinity_broken = migrate_one_irq(desc); + raw_spin_unlock(&desc->lock); + + if (affinity_broken) + pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", + irq, smp_processor_id()); + } + + local_irq_restore(flags); +} diff --git a/kernel/kernel/irq/dummychip.c b/kernel/kernel/irq/dummychip.c index 2feb6feca..326a67f24 100644 --- a/kernel/kernel/irq/dummychip.c +++ b/kernel/kernel/irq/dummychip.c @@ -42,6 +42,7 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, + .flags = IRQCHIP_SKIP_SET_WAKE, }; /* diff --git a/kernel/kernel/irq/generic-chip.c b/kernel/kernel/irq/generic-chip.c index 61024e8ab..abd286afb 100644 --- a/kernel/kernel/irq/generic-chip.c +++ b/kernel/kernel/irq/generic-chip.c @@ -360,7 +360,7 @@ static struct lock_class_key irq_nested_lock_class; int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq) { - struct irq_data *data = irq_get_irq_data(virq); + struct irq_data *data = irq_domain_get_irq_data(d, virq); struct irq_domain_chip_generic *dgc = d->gc; struct irq_chip_generic *gc; struct irq_chip_type *ct; @@ -405,8 +405,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, else data->mask = 1 << idx; - irq_set_chip_and_handler(virq, chip, ct->handler); - irq_set_chip_data(virq, gc); + irq_domain_set_info(d, virq, hw_irq, chip, gc, ct->handler, NULL, NULL); irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } @@ -554,6 +553,9 @@ static int irq_gc_suspend(void) if (data) ct->chip.irq_suspend(data); } + + if (gc->suspend) + gc->suspend(gc); } return 0; } @@ -565,6 +567,9 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; + if (gc->resume) + gc->resume(gc); + if (ct->chip.irq_resume) { struct irq_data *data = irq_gc_get_irq_data(gc); diff --git a/kernel/kernel/irq/handle.c b/kernel/kernel/irq/handle.c index 26a63672c..6c65c9252 100644 --- a/kernel/kernel/irq/handle.c +++ b/kernel/kernel/irq/handle.c @@ -22,17 +22,19 @@ /** * handle_bad_irq - handle spurious and unhandled irqs - * @irq: the interrupt number * @desc: description of the interrupt * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(struct irq_desc *desc) { + unsigned int irq = irq_desc_get_irq(desc); + print_irq_desc(irq, desc); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); } +EXPORT_SYMBOL_GPL(handle_bad_irq); /* * Special, empty irq handler: @@ -130,15 +132,16 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { struct pt_regs *regs = get_irq_regs(); u64 ip = regs ? instruction_pointer(regs) : 0; irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; + struct irqaction *action = desc->action; - do { + /* action might have become NULL since we dropped the lock */ + while (action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,29 +176,28 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) retval |= res; action = action->next; - } while (action); + } -#ifndef CONFIG_PREEMPT_RT_FULL - add_interrupt_randomness(irq, flags, ip); -#else +#ifdef CONFIG_PREEMPT_RT_FULL desc->random_ip = ip; +#else + add_interrupt_randomness(irq, flags, ip); #endif if (!noirqdebug) - note_interrupt(irq, desc, retval); + note_interrupt(desc, retval); return retval; } irqreturn_t handle_irq_event(struct irq_desc *desc) { - struct irqaction *action = desc->action; irqreturn_t ret; desc->istate &= ~IRQS_PENDING; irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - ret = handle_irq_event_percpu(desc, action); + ret = handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); diff --git a/kernel/kernel/irq/internals.h b/kernel/kernel/irq/internals.h index df553b0af..fcab63c66 100644 --- a/kernel/kernel/irq/internals.h +++ b/kernel/kernel/irq/internals.h @@ -18,6 +18,8 @@ extern bool noirqdebug; +extern struct irqaction chained_action; + /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run @@ -59,12 +61,9 @@ enum { #include "debug.h" #include "settings.h" -#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) - -extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags); -extern void __disable_irq(struct irq_desc *desc, unsigned int irq); -extern void __enable_irq(struct irq_desc *desc, unsigned int irq); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); +extern void __disable_irq(struct irq_desc *desc); +extern void __enable_irq(struct irq_desc *desc); extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); @@ -78,21 +77,17 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } -extern void irq_lock_sparse(void); -extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); -static inline void irq_lock_sparse(void) { } -static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc, unsigned int irq); +void check_irq_resend(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); @@ -170,35 +165,45 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) */ static inline void irqd_set_move_pending(struct irq_data *d) { - d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING; } static inline void irqd_clr_move_pending(struct irq_data *d) { - d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; } static inline void irqd_clear(struct irq_data *d, unsigned int mask) { - d->state_use_accessors &= ~mask; + __irqd_to_state(d) &= ~mask; } static inline void irqd_set(struct irq_data *d, unsigned int mask) { - d->state_use_accessors |= mask; + __irqd_to_state(d) |= mask; } static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { - return d->state_use_accessors & mask; + return __irqd_to_state(d) & mask; } -static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } +static inline int irq_desc_get_node(struct irq_desc *desc) +{ + return irq_common_data_get_node(&desc->irq_common_data); +} + +static inline int irq_desc_is_chained(struct irq_desc *desc) +{ + return (desc->action && desc->action == &chained_action); +} + #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/kernel/irq/irqdesc.c b/kernel/kernel/irq/irqdesc.c index 99793b9b6..0b73349a4 100644 --- a/kernel/kernel/irq/irqdesc.c +++ b/kernel/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) @@ -38,12 +55,13 @@ static void __init init_irq_default_affinity(void) #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { - if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, + gfp, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif @@ -52,23 +70,19 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - desc->irq_data.node = node; - cpumask_copy(desc->irq_data.affinity, irq_default_affinity); + cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif -} - -static inline int desc_node(struct irq_desc *desc) -{ - return desc->irq_data.node; +#ifdef CONFIG_NUMA + desc->irq_common_data.node = node; +#endif } #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node) { } -static inline int desc_node(struct irq_desc *desc) { return 0; } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, @@ -76,11 +90,13 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, { int cpu; + desc->irq_common_data.handler_data = NULL; + desc->irq_common_data.msi_desc = NULL; + + desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.handler_data = NULL; - desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; @@ -126,7 +142,7 @@ static void free_masks(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); } #else static inline void free_masks(struct irq_desc *desc) { } @@ -299,7 +315,7 @@ static void free_desc(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } @@ -348,7 +364,7 @@ int generic_handle_irq(unsigned int irq) if (!desc) return -EINVAL; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return 0; } EXPORT_SYMBOL_GPL(generic_handle_irq); @@ -587,7 +603,7 @@ int irq_set_percpu_devid(unsigned int irq) void kstat_incr_irq_this_cpu(unsigned int irq) { - kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** @@ -619,7 +635,7 @@ unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); int cpu; - int sum = 0; + unsigned int sum = 0; if (!desc || !desc->kstat_irqs) return 0; @@ -639,7 +655,7 @@ unsigned int kstat_irqs(unsigned int irq) */ unsigned int kstat_irqs_usr(unsigned int irq) { - int sum; + unsigned int sum; irq_lock_sparse(); sum = kstat_irqs(irq); diff --git a/kernel/kernel/irq/irqdomain.c b/kernel/kernel/irq/irqdomain.c index 7fac31105..22aa9612e 100644 --- a/kernel/kernel/irq/irqdomain.c +++ b/kernel/kernel/irq/irqdomain.c @@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); +struct irqchip_fwid { + struct fwnode_handle fwnode; + char *name; + void *data; +}; + +/** + * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for + * identifying an irq domain + * @data: optional user-provided data + * + * Allocate a struct device_node, and return a poiner to the embedded + * fwnode_handle (or NULL on failure). + */ +struct fwnode_handle *irq_domain_alloc_fwnode(void *data) +{ + struct irqchip_fwid *fwid; + char *name; + + fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "irqchip@%p", data); + + if (!fwid || !name) { + kfree(fwid); + kfree(name); + return NULL; + } + + fwid->name = name; + fwid->data = data; + fwid->fwnode.type = FWNODE_IRQCHIP; + return &fwid->fwnode; +} + +/** + * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle + * + * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. + */ +void irq_domain_free_fwnode(struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid; + + if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + return; + + fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + kfree(fwid->name); + kfree(fwid); +} + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -40,23 +91,28 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain); * Allocates and initialize and irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; + struct device_node *of_node; + + of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; + of_node_get(of_node); + /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; - domain->of_node = of_node_get(of_node); + domain->fwnode = fwnode; domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; @@ -102,7 +158,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(domain->of_node); + of_node_put(irq_domain_get_of_node(domain)); kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); @@ -133,7 +189,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); + domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); if (!domain) return NULL; @@ -177,7 +233,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, first_hwirq + size, + domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size, first_hwirq + size, 0, ops, host_data); if (domain) irq_domain_associate_many(domain, first_irq, first_hwirq, size); @@ -187,10 +243,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_host() - Locates a domain for a given device node - * @node: device-tree node of the interrupt controller + * irq_find_matching_fwnode() - Locates a domain for a given fwnode + * @fwnode: FW descriptor of the interrupt controller + * @bus_token: domain-specific data */ -struct irq_domain *irq_find_host(struct device_node *node) +struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; int rc; @@ -199,13 +257,19 @@ struct irq_domain *irq_find_host(struct device_node *node) * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... + * + * bus_token == DOMAIN_BUS_ANY matches any domain, any other + * values must generate an exact match for the domain to be + * selected. */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->match) - rc = h->ops->match(h, node); + rc = h->ops->match(h, to_of_node(fwnode), bus_token); else - rc = (h->of_node != NULL) && (h->of_node == node); + rc = ((fwnode != NULL) && (h->fwnode == fwnode) && + ((bus_token == DOMAIN_BUS_ANY) || + (h->bus_token == bus_token))); if (rc) { found = h; @@ -215,7 +279,7 @@ struct irq_domain *irq_find_host(struct device_node *node) mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_host); +EXPORT_SYMBOL_GPL(irq_find_matching_fwnode); /** * irq_set_default_host() - Set a "default" irq domain @@ -328,10 +392,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { + struct device_node *of_node; int i; + of_node = irq_domain_get_of_node(domain); pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, - of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + of_node_full_name(of_node), irq_base, (int)hwirq_base, count); for (i = 0; i < count; i++) { irq_domain_associate(domain, irq_base + i, hwirq_base + i); @@ -351,12 +417,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); */ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { + struct device_node *of_node; unsigned int virq; if (domain == NULL) domain = irq_default_domain; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + of_node = irq_domain_get_of_node(domain); + virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; @@ -391,6 +459,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct device_node *of_node; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -404,6 +473,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("-> using domain @%p\n", domain); + of_node = irq_domain_get_of_node(domain); + /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { @@ -412,8 +483,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, - of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -425,7 +495,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, of_node_full_name(domain->of_node), virq); + hwirq, of_node_full_name(of_node), virq); return virq; } @@ -452,10 +522,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { + struct device_node *of_node; int ret; + of_node = irq_domain_get_of_node(domain); ret = irq_alloc_descs(irq_base, irq_base, count, - of_node_to_nid(domain->of_node)); + of_node_to_nid(of_node)); if (unlikely(ret < 0)) return ret; @@ -464,28 +536,56 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); -unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +static int irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + irq_hw_number_t *hwirq, unsigned int *type) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (d->ops->translate) + return d->ops->translate(d, fwspec, hwirq, type); +#endif + if (d->ops->xlate) + return d->ops->xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); + + /* If domain has no translation, then we assume interrupt line */ + *hwirq = fwspec->param[0]; + return 0; +} + +static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, + struct irq_fwspec *fwspec) +{ + int i; + + fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; + fwspec->param_count = irq_data->args_count; + + for (i = 0; i < irq_data->args_count; i++) + fwspec->param[i] = irq_data->args[i]; +} + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; - domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; + if (fwspec->fwnode) + domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); + else + domain = irq_default_domain; + if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(irq_data->np)); + of_node_full_name(to_of_node(fwspec->fwnode))); return 0; } - /* If domain has no translation, then we assume interrupt line */ - if (domain->ops->xlate == NULL) - hwirq = irq_data->args[0]; - else { - if (domain->ops->xlate(domain, irq_data->np, irq_data->args, - irq_data->args_count, &hwirq, &type)) - return 0; - } + if (irq_domain_translate(domain, fwspec, &hwirq, &type)) + return 0; if (irq_domain_is_hierarchy(domain)) { /* @@ -496,7 +596,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) if (virq) return virq; - virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; } else { @@ -512,6 +612,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) irq_set_irq_type(virq, type); return virq; } +EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); + +unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(irq_data, &fwspec); + return irq_create_fwspec_mapping(&fwspec); +} EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** @@ -582,14 +691,16 @@ static int virq_debug_show(struct seq_file *m, void *private) "name", "mapped", "linear-max", "direct-max", "devtree-node"); mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { + struct device_node *of_node; int count = 0; + of_node = irq_domain_get_of_node(domain); radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - domain->of_node ? of_node_full_name(domain->of_node) : ""); + of_node ? of_node_full_name(of_node) : ""); } mutex_unlock(&irq_domain_mutex); @@ -743,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** - * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy * @parent: Parent irq domain to associate with the new domain * @flags: Irq domain flags associated to the domain * @size: Size of the domain. See below - * @node: Optional device-tree node of the interrupt controller + * @fwnode: Optional fwnode of the interrupt controller * @ops: Pointer to the interrupt domain callbacks * @host_data: Controller private data pointer * @@ -757,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, * domain flags are set. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, unsigned int flags, unsigned int size, - struct device_node *node, + struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; if (size) - domain = irq_domain_add_linear(node, size, ops, host_data); + domain = irq_domain_create_linear(fwnode, size, ops, host_data); else - domain = irq_domain_add_tree(node, ops, host_data); + domain = irq_domain_create_tree(fwnode, ops, host_data); if (domain) { domain->parent = parent; domain->flags |= flags; @@ -830,11 +941,12 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, { struct irq_data *irq_data; - irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); + irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, + irq_data_get_node(child)); if (irq_data) { child->parent_data = irq_data; irq_data->irq = child->irq; - irq_data->node = child->node; + irq_data->common = child->common; irq_data->domain = domain; } @@ -1232,6 +1344,27 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } +/** + * irq_domain_set_info - Set the complete data for a @virq in @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hardware interrupt number + * @chip: The associated interrupt chip + * @chip_data: The associated interrupt chip data + * @handler: The interrupt flow handler + * @handler_data: The interrupt flow handler data + * @handler_name: The interrupt handler name + */ +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name) +{ + irq_set_chip_and_handler_name(virq, chip, handler, handler_name); + irq_set_chip_data(virq, chip_data); + irq_set_handler_data(virq, handler_data); +} + static void irq_domain_check_hierarchy(struct irq_domain *domain) { } diff --git a/kernel/kernel/irq/manage.c b/kernel/kernel/irq/manage.c index 79c55c26e..8e89554aa 100644 --- a/kernel/kernel/irq/manage.c +++ b/kernel/kernel/irq/manage.c @@ -117,6 +117,14 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; +static int __irq_can_set_affinity(struct irq_desc *desc) +{ + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + return 0; + return 1; +} + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -124,13 +132,7 @@ cpumask_var_t irq_default_affinity; */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !irqd_can_balance(&desc->irq_data) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - - return 1; + return __irq_can_set_affinity(irq_to_desc(irq)); } /** @@ -248,7 +250,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: - cpumask_copy(data->affinity, mask); + cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; @@ -337,7 +339,7 @@ static void _irq_affinity_notify(struct irq_affinity_notify *notify) if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else - cpumask_copy(cpumask, desc->irq_data.affinity); + cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); @@ -403,14 +405,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) { struct cpumask *set = irq_default_affinity; - int node = desc->irq_data.node; + int node = irq_desc_get_node(desc); /* Excludes PER_CPU and NO_BALANCE interrupts */ - if (!irq_can_set_affinity(irq)) + if (!__irq_can_set_affinity(desc)) return 0; /* @@ -418,9 +419,9 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) * one of the targets is online. */ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { - if (cpumask_intersects(desc->irq_data.affinity, + if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) - set = desc->irq_data.affinity; + set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } @@ -437,10 +438,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) return 0; } #else -static inline int -setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) +/* Wrapper for ALPHA specific affinity selector magic */ +static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) { - return irq_select_affinity(irq); + return irq_select_affinity(irq_desc_get_irq(d)); } #endif @@ -454,20 +455,51 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc, mask); + ret = setup_affinity(desc, mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } #else static inline int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +setup_affinity(struct irq_desc *desc, struct cpumask *mask) { return 0; } #endif -void __disable_irq(struct irq_desc *desc, unsigned int irq) +/** + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data + * + * This function uses the vCPU specific data to set the vCPU + * affinity for an irq. The vCPU specific data is passed from + * outside, such as KVM. One example code path is as below: + * KVM -> IOMMU -> irq_set_vcpu_affinity(). + */ +int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + struct irq_data *data; + struct irq_chip *chip; + int ret = -ENOSYS; + + if (!desc) + return -EINVAL; + + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + ret = chip->irq_set_vcpu_affinity(data, vcpu_info); + irq_put_desc_unlock(desc, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); + +void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); @@ -480,7 +512,7 @@ static int __disable_irq_nosync(unsigned int irq) if (!desc) return -EINVAL; - __disable_irq(desc, irq); + __disable_irq(desc); irq_put_desc_busunlock(desc, flags); return 0; } @@ -547,12 +579,13 @@ bool disable_hardirq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_hardirq); -void __enable_irq(struct irq_desc *desc, unsigned int irq) +void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: - WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", + irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) @@ -560,7 +593,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq) /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); irq_enable(desc); - check_irq_resend(desc, irq); + check_irq_resend(desc); /* fall-through */ } default: @@ -590,7 +623,7 @@ void enable_irq(unsigned int irq) KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; - __enable_irq(desc, irq); + __enable_irq(desc); out: irq_put_desc_busunlock(desc, flags); } @@ -681,8 +714,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return canrequest; } -int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) +int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; @@ -692,7 +724,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", + irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -729,7 +762,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); + flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); @@ -878,8 +911,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (desc->irq_data.affinity) - cpumask_copy(mask, desc->irq_data.affinity); + if (desc->irq_common_data.affinity) + cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; raw_spin_unlock_irq(&desc->lock); @@ -1124,7 +1157,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); - param.sched_priority += 1; + param.sched_priority -= 1; } if (IS_ERR(t)) @@ -1340,8 +1373,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, - new->flags & IRQF_TRIGGER_MASK); + ret = __irq_set_trigger(desc, + new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_mask; @@ -1375,7 +1408,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_settings_set_no_softirq_call(desc); /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc, mask); + setup_affinity(desc, mask); } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; @@ -1401,7 +1434,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq); + __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -1493,6 +1526,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* @@ -1506,7 +1540,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); - + chip_bus_sync_unlock(desc); return NULL; } @@ -1522,6 +1556,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { + irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); irq_release_resources(desc); } @@ -1533,6 +1568,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1611,9 +1647,7 @@ void free_irq(unsigned int irq, void *dev_id) desc->affinity_notify = NULL; #endif - chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); @@ -1787,7 +1821,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (type != IRQ_TYPE_NONE) { int ret; - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, type); if (ret) { WARN(1, "failed to set type for IRQ%d\n", irq); @@ -1896,6 +1930,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } +EXPORT_SYMBOL_GPL(free_percpu_irq); /** * setup_percpu_irq - setup a per-cpu interrupt @@ -1925,9 +1960,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources, but doesn't - * automatically enable the interrupt. It has to be done on each - * CPU using enable_percpu_irq(). + * This call allocates interrupt resources and enables the + * interrupt on the local CPU. If the interrupt is supposed to be + * enabled on other CPUs, it has to be done on each CPU using + * enable_percpu_irq(). * * Dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of @@ -1966,6 +2002,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } +EXPORT_SYMBOL_GPL(request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. @@ -2012,6 +2049,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, irq_put_desc_busunlock(desc, flags); return err; } +EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. @@ -2022,7 +2060,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * - * This function should be called with preemption disabled if the + * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, @@ -2057,3 +2095,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, irq_put_desc_busunlock(desc, flags); return err; } +EXPORT_SYMBOL_GPL(irq_set_irqchip_state); diff --git a/kernel/kernel/irq/migration.c b/kernel/kernel/irq/migration.c index ca3f4aaff..37ddb7bda 100644 --- a/kernel/kernel/irq/migration.c +++ b/kernel/kernel/irq/migration.c @@ -7,21 +7,21 @@ void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = idata->chip; + struct irq_chip *chip = desc->irq_data.chip; if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; + irqd_clr_move_pending(&desc->irq_data); + /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (!irqd_can_balance(&desc->irq_data)) { + if (irqd_is_per_cpu(&desc->irq_data)) { WARN_ON(1); return; } - irqd_clr_move_pending(&desc->irq_data); - if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata) { bool masked; + /* + * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled, + * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is + * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here. + */ + idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); + if (likely(!irqd_is_setaffinity_pending(idata))) return; diff --git a/kernel/kernel/irq/msi.c b/kernel/kernel/irq/msi.c index 474de5cb3..6b0c0b74a 100644 --- a/kernel/kernel/irq/msi.c +++ b/kernel/kernel/irq/msi.c @@ -18,6 +18,23 @@ /* Temparory solution for building, will be removed later */ #include <linux/pci.h> +struct msi_desc *alloc_msi_entry(struct device *dev) +{ + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return NULL; + + INIT_LIST_HEAD(&desc->list); + desc->dev = dev; + + return desc; +} + +void free_msi_entry(struct msi_desc *entry) +{ + kfree(entry); +} + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { *msg = entry->msg; @@ -124,7 +141,7 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -static struct irq_domain_ops msi_domain_ops = { +static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, @@ -211,22 +228,18 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) { struct irq_chip *chip = info->chip; - BUG_ON(!chip); - if (!chip->irq_mask) - chip->irq_mask = pci_msi_mask_irq; - if (!chip->irq_unmask) - chip->irq_unmask = pci_msi_unmask_irq; + BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask); if (!chip->irq_set_affinity) chip->irq_set_affinity = msi_domain_set_affinity; } /** * msi_create_irq_domain - Create a MSI interrupt domain - * @of_node: Optional device-tree node of the interrupt controller + * @fwnode: Optional fwnode of the interrupt controller * @info: MSI domain info * @parent: Parent irq domain */ -struct irq_domain *msi_create_irq_domain(struct device_node *node, +struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { @@ -235,8 +248,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, - info); + return irq_domain_create_hierarchy(parent, 0, 0, fwnode, + &msi_domain_ops, info); } /** diff --git a/kernel/kernel/irq/pm.c b/kernel/kernel/irq/pm.c index 5204a6d1b..cea1de016 100644 --- a/kernel/kernel/irq/pm.c +++ b/kernel/kernel/irq/pm.c @@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc) desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); - pm_system_wakeup(); + pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; @@ -68,9 +68,10 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) desc->cond_suspend_depth--; } -static bool suspend_device_irq(struct irq_desc *desc, int irq) +static bool suspend_device_irq(struct irq_desc *desc) { - if (!desc->action || desc->no_suspend_depth) + if (!desc->action || irq_desc_is_chained(desc) || + desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(&desc->irq_data)) { @@ -85,7 +86,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) } desc->istate |= IRQS_SUSPENDED; - __disable_irq(desc, irq); + __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility @@ -123,8 +124,10 @@ void suspend_device_irqs(void) unsigned long flags; bool sync; + if (irq_settings_is_nested_thread(desc)) + continue; raw_spin_lock_irqsave(&desc->lock, flags); - sync = suspend_device_irq(desc, irq); + sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) @@ -133,7 +136,7 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); -static void resume_irq(struct irq_desc *desc, int irq) +static void resume_irq(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); @@ -148,7 +151,7 @@ static void resume_irq(struct irq_desc *desc, int irq) desc->depth++; resume: desc->istate &= ~IRQS_SUSPENDED; - __enable_irq(desc, irq); + __enable_irq(desc); } static void resume_irqs(bool want_early) @@ -163,9 +166,11 @@ static void resume_irqs(bool want_early) if (!is_early && want_early) continue; + if (irq_settings_is_nested_thread(desc)) + continue; raw_spin_lock_irqsave(&desc->lock, flags); - resume_irq(desc, irq); + resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } diff --git a/kernel/kernel/irq/proc.c b/kernel/kernel/irq/proc.c index df2f4642d..a2c02fd5d 100644 --- a/kernel/kernel/irq/proc.c +++ b/kernel/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/mutex.h> #include "internals.h" @@ -39,7 +40,7 @@ static struct proc_dir_entry *root_irq_dir; static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_data.affinity; + const struct cpumask *mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (irqd_is_setaffinity_pending(&desc->irq_data)) @@ -241,7 +242,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); - seq_printf(m, "%d\n", desc->irq_data.node); + seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } @@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_MUTEX(register_lock); char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; + /* + * irq directories are registered only when a handler is + * added, not when the descriptor is created, so multiple + * tasks might try to register at the same time. + */ + mutex_lock(®ister_lock); + + if (desc->dir) + goto out_unlock; + memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - return; + goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ @@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: + mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) @@ -460,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; - if (!action && !any_count) + if ((!action || irq_desc_is_chained(desc)) && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); diff --git a/kernel/kernel/irq/resend.c b/kernel/kernel/irq/resend.c index 7a5237a1b..b86886bee 100644 --- a/kernel/kernel/irq/resend.c +++ b/kernel/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); local_irq_disable(); - desc->handle_irq(irq, desc); + desc->handle_irq(desc); local_irq_enable(); } } @@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); * * Is called with interrupts disabled and desc->lock held. */ -void check_irq_resend(struct irq_desc *desc, unsigned int irq) +void check_irq_resend(struct irq_desc *desc) { /* * We do not resend level type interrupts. Level type @@ -74,6 +74,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND + unsigned int irq = irq_desc_get_irq(desc); + /* * If the interrupt is running in the thread * context of the parent irq we need to be diff --git a/kernel/kernel/irq/settings.h b/kernel/kernel/irq/settings.h index 34b803b89..2df2d4445 100644 --- a/kernel/kernel/irq/settings.h +++ b/kernel/kernel/irq/settings.h @@ -15,6 +15,7 @@ enum { _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, + _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -29,6 +30,7 @@ enum { #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON +#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -166,3 +168,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_IS_POLLED; } + +static inline bool irq_settings_disable_unlazy(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY; +} + +static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; +} diff --git a/kernel/kernel/irq/spurious.c b/kernel/kernel/irq/spurious.c index 903a69c45..ed26f2554 100644 --- a/kernel/kernel/irq/spurious.c +++ b/kernel/kernel/irq/spurious.c @@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc, bool force) +static int try_one_irq(struct irq_desc *desc, bool force) { irqreturn_t ret = IRQ_NONE; struct irqaction *action; @@ -133,7 +133,7 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc, false)) + if (try_one_irq(desc, false)) ok = 1; } out: @@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy) continue; local_irq_disable(); - try_one_irq(i, desc, true); + try_one_irq(desc, true); local_irq_enable(); } out: @@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret) * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) */ -static void -__report_bad_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { + unsigned int irq = irq_desc_get_irq(desc); struct irqaction *action; unsigned long flags; @@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, raw_spin_unlock_irqrestore(&desc->lock, flags); } -static void -report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { static int count = 100; if (count > 0) { count--; - __report_bad_irq(irq, desc, action_ret); + __report_bad_irq(desc, action_ret); } } @@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, #define SPURIOUS_DEFERRED 0x80000000 -void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) { + unsigned int irq; + if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc)) return; if (bad_action_ret(action_ret)) { - report_bad_irq(irq, desc, action_ret); + report_bad_irq(desc, action_ret); return; } @@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->last_unhandled = jiffies; } + irq = irq_desc_get_irq(desc); if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) @@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, /* * The interrupt is stuck */ - __report_bad_irq(irq, desc, action_ret); + __report_bad_irq(desc, action_ret); /* * Now kill the IRQ */ diff --git a/kernel/kernel/irq_work.c b/kernel/kernel/irq_work.c index 5a0f45251..2899ba0d2 100644 --- a/kernel/kernel/irq_work.c +++ b/kernel/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. @@ -200,8 +200,17 @@ void irq_work_tick(void) if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) +void irq_work_tick_soft(void) +{ irq_work_run_list(this_cpu_ptr(&lazy_list)); } +#endif /* * Synchronize against the irq_work @entry, ensures the entry is not diff --git a/kernel/kernel/jump_label.c b/kernel/kernel/jump_label.c index 9019f15de..05254eeb4 100644 --- a/kernel/kernel/jump_label.c +++ b/kernel/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> - * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> @@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct static_key *key, int enable); +static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { @@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_ENABLE); - else - jump_label_update(key, JUMP_LABEL_DISABLE); - } - atomic_inc(&key->enabled); + if (atomic_inc_return(&key->enabled) == 1) + jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key, atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); } else { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_DISABLE); - else - jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_update(key); } jump_label_unlock(); } @@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } -/* +/* * Update code which is definitely not currently executing. * Architectures which need heavyweight synchronization to modify * running code can override this to make the non-live update case @@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - arch_jump_label_transform(entry, type); + arch_jump_label_transform(entry, type); +} + +static inline struct jump_entry *static_key_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); +} + +static inline bool static_key_type(struct static_key *key) +{ + return (unsigned long)key->entries & JUMP_TYPE_MASK; +} + +static inline struct static_key *jump_entry_key(struct jump_entry *entry) +{ + return (struct static_key *)((unsigned long)entry->key & ~1UL); +} + +static bool jump_entry_branch(struct jump_entry *entry) +{ + return (unsigned long)entry->key & 1UL; +} + +static enum jump_label_type jump_label_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool enabled = static_key_enabled(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return enabled ^ branch; } static void __jump_label_update(struct static_key *key, struct jump_entry *entry, - struct jump_entry *stop, int enable) + struct jump_entry *stop) { - for (; (entry < stop) && - (entry->key == (jump_label_t)(unsigned long)key); - entry++) { + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* * entry->code set to 0 invalidates module init text sections * kernel_text_address() verifies we are not in core kernel * init code, see jump_label_invalidate_module_init(). */ if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, enable); + arch_jump_label_transform(entry, jump_label_type(entry)); } } -static enum jump_label_type jump_label_type(struct static_key *key) -{ - bool true_branch = jump_label_get_branch_default(key); - bool state = static_key_enabled(key); - - if ((!true_branch && state) || (true_branch && !state)) - return JUMP_LABEL_ENABLE; - - return JUMP_LABEL_DISABLE; -} - void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; @@ -202,8 +211,11 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_type(iterk)); + /* rewrite NOPs */ + if (jump_label_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); + + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -222,6 +234,16 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES +static enum jump_label_type jump_label_init_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool type = static_key_type(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return type ^ branch; +} + struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; @@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct static_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key) { - struct static_key_mod *mod = key->next; + struct static_key_mod *mod; - while (mod) { + for (mod = key->next; mod; mod = mod->next) { struct module *m = mod->mod; __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries, - enable); - mod = mod->next; + m->jump_entries + m->num_jump_entries); } } @@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + /* Only write NOPs for arch_branch_static(). */ + if (jump_label_init_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); } } @@ -297,12 +319,12 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; + iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; - if (__module_address(iter->key) == mod) { + if (within_module(iter->key, mod)) { /* * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. */ @@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod) jlm->next = key->next; key->next = jlm; - if (jump_label_type(key) == JUMP_LABEL_ENABLE) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); + /* Only update if we've changed from our initial state */ + if (jump_label_type(iter) != jump_label_init_type(iter)) + __jump_label_update(key, iter, iter_stop); } return 0; @@ -334,12 +357,12 @@ static void jump_label_del_module(struct module *mod) struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) + if (jump_entry_key(iter) == key) continue; - key = (struct static_key *)(unsigned long)iter->key; + key = jump_entry_key(iter); - if (__module_address(iter->key) == mod) + if (within_module(iter->key, mod)) continue; prev = &key->next; @@ -439,22 +462,61 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct static_key *key, int enable) +static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = jump_label_get_entries(key); - + struct jump_entry *entry = static_key_entries(key); #ifdef CONFIG_MODULES - struct module *mod = __module_address((unsigned long)key); + struct module *mod; - __jump_label_mod_update(key, enable); + __jump_label_mod_update(key); + preempt_disable(); + mod = __module_address((unsigned long)key); if (mod) stop = mod->jump_entries + mod->num_jump_entries; + preempt_enable(); #endif /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, enable); + __jump_label_update(key, entry, stop); } -#endif +#ifdef CONFIG_STATIC_KEYS_SELFTEST +static DEFINE_STATIC_KEY_TRUE(sk_true); +static DEFINE_STATIC_KEY_FALSE(sk_false); + +static __init int jump_label_test(void) +{ + int i; + + for (i = 0; i < 2; i++) { + WARN_ON(static_key_enabled(&sk_true.key) != true); + WARN_ON(static_key_enabled(&sk_false.key) != false); + + WARN_ON(!static_branch_likely(&sk_true)); + WARN_ON(!static_branch_unlikely(&sk_true)); + WARN_ON(static_branch_likely(&sk_false)); + WARN_ON(static_branch_unlikely(&sk_false)); + + static_branch_disable(&sk_true); + static_branch_enable(&sk_false); + + WARN_ON(static_key_enabled(&sk_true.key) == true); + WARN_ON(static_key_enabled(&sk_false.key) == false); + + WARN_ON(static_branch_likely(&sk_true)); + WARN_ON(static_branch_unlikely(&sk_true)); + WARN_ON(!static_branch_likely(&sk_false)); + WARN_ON(!static_branch_unlikely(&sk_false)); + + static_branch_enable(&sk_true); + static_branch_disable(&sk_false); + } + + return 0; +} +late_initcall(jump_label_test); +#endif /* STATIC_KEYS_SELFTEST */ + +#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kernel/kcmp.c b/kernel/kernel/kcmp.c index 0aa69ea1d..3a47fa998 100644 --- a/kernel/kernel/kcmp.c +++ b/kernel/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/kernel/kexec.c b/kernel/kernel/kexec.c index 7a36fdcca..d873b64fb 100644 --- a/kernel/kernel/kexec.c +++ b/kernel/kernel/kexec.c @@ -1,145 +1,24 @@ /* - * kexec.c - kexec system call + * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> -#include <linux/slab.h> -#include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/reboot.h> -#include <linux/ioport.h> -#include <linux/hardirq.h> -#include <linux/elf.h> -#include <linux/elfcore.h> -#include <linux/utsname.h> -#include <linux/numa.h> -#include <linux/suspend.h> -#include <linux/device.h> -#include <linux/freezer.h> -#include <linux/pm.h> -#include <linux/cpu.h> -#include <linux/console.h> #include <linux/vmalloc.h> -#include <linux/swap.h> -#include <linux/syscore_ops.h> -#include <linux/compiler.h> -#include <linux/hugetlb.h> - -#include <asm/page.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/sections.h> - -#include <crypto/hash.h> -#include <crypto/sha.h> - -/* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t __percpu *crash_notes; - -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - -/* Flag to indicate we are going to kexec a new kernel */ -bool kexec_in_progress = false; - -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - -#ifdef CONFIG_KEXEC_FILE -static int kexec_calculate_store_digests(struct kimage *image); -#endif - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -int kexec_should_crash(struct task_struct *p) -{ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. - * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in <asm/kexec.h>. - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. - * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. - */ -#define KIMAGE_NO_DEST (-1UL) +#include <linux/slab.h> -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long dest); +#include "kexec_internal.h" static int copy_user_segment_list(struct kimage *image, unsigned long nr_segments, @@ -158,125 +37,6 @@ static int copy_user_segment_list(struct kimage *image, return ret; } -static int sanity_check_segment_list(struct kimage *image) -{ - int result, i; - unsigned long nr_segments = image->nr_segments; - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - * - * Since the kernel does everything in page size chunks ensure - * the destination addresses are page aligned. Too many - * special cases crop of when we don't do this. The most - * insidious is getting overlapping destination addresses - * simply because addresses are changed to page size - * granularity. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; - } - - /* Verify our destination addresses do not overlap. - * If we alloed overlapping destination addresses - * through very weird things can happen with no - * easy explanation as one segment stops on another. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - unsigned long j; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - for (j = 0; j < i; j++) { - unsigned long pstart, pend; - pstart = image->segment[j].mem; - pend = pstart + image->segment[j].memsz; - /* Do the segments overlap ? */ - if ((mend > pstart) && (mstart < pend)) - return result; - } - } - - /* Ensure our buffer sizes are strictly less than - * our memory sizes. This should always be the case, - * and it is easier to check up front than to be surprised - * later on. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - if (image->segment[i].bufsz > image->segment[i].memsz) - return result; - } - - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - - if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; - } - } - - return 0; -} - -static struct kimage *do_kimage_alloc_init(void) -{ - struct kimage *image; - - /* Allocate a controlling structure */ - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - return NULL; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); - - return image; -} - -static void kimage_free_page_list(struct list_head *list); - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, @@ -343,873 +103,6 @@ out_free_image: return ret; } -#ifdef CONFIG_KEXEC_FILE -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - -/* Architectures can provide this probe function */ -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return -ENOEXEC; -} - -void * __weak arch_kexec_kernel_image_load(struct kimage *image) -{ - return ERR_PTR(-ENOEXEC); -} - -void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) -{ -} - -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return -EKEYREJECTED; -} - -/* Apply relocations of type RELA */ -int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* Apply relocations of type REL */ -int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * Free up memory used by kernel, initrd, and command line. This is temporary - * memory allocation which is not needed any more after these buffers have - * been loaded into separate segments and have been copied elsewhere. - */ -static void kimage_file_post_load_cleanup(struct kimage *image) -{ - struct purgatory_info *pi = &image->purgatory_info; - - vfree(image->kernel_buf); - image->kernel_buf = NULL; - - vfree(image->initrd_buf); - image->initrd_buf = NULL; - - kfree(image->cmdline_buf); - image->cmdline_buf = NULL; - - vfree(pi->purgatory_buf); - pi->purgatory_buf = NULL; - - vfree(pi->sechdrs); - pi->sechdrs = NULL; - - /* See if architecture has anything to cleanup post load */ - arch_kimage_file_post_load_cleanup(image); - - /* - * Above call should have called into bootloader to free up - * any data stored in kimage->image_loader_data. It should - * be ok now to free it up. - */ - kfree(image->image_loader_data); - image->image_loader_data = NULL; -} - -/* - * In file mode list of segments is prepared by kernel. Copy relevant - * data from user space, do error checking, prepare segment list - */ -static int -kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, - const char __user *cmdline_ptr, - unsigned long cmdline_len, unsigned flags) -{ - int ret = 0; - void *ldata; - - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); - if (ret) - return ret; - - /* Call arch image probe handlers */ - ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, - image->kernel_buf_len); - - if (ret) - goto out; - -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); - goto out; - } - pr_debug("kernel signature verification successful.\n"); -#endif - /* It is possible that there no initramfs is being loaded */ - if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); - if (ret) - goto out; - } - - if (cmdline_len) { - image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); - if (!image->cmdline_buf) { - ret = -ENOMEM; - goto out; - } - - ret = copy_from_user(image->cmdline_buf, cmdline_ptr, - cmdline_len); - if (ret) { - ret = -EFAULT; - goto out; - } - - image->cmdline_buf_len = cmdline_len; - - /* command line should be a string with last byte null */ - if (image->cmdline_buf[cmdline_len - 1] != '\0') { - ret = -EINVAL; - goto out; - } - } - - /* Call arch image load handlers */ - ldata = arch_kexec_kernel_image_load(image); - - if (IS_ERR(ldata)) { - ret = PTR_ERR(ldata); - goto out; - } - - image->image_loader_data = ldata; -out: - /* In case of error, free up all allocated memory in this function */ - if (ret) - kimage_file_post_load_cleanup(image); - return ret; -} - -static int -kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, - int initrd_fd, const char __user *cmdline_ptr, - unsigned long cmdline_len, unsigned long flags) -{ - int ret; - struct kimage *image; - bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; - - image = do_kimage_alloc_init(); - if (!image) - return -ENOMEM; - - image->file_mode = 1; - - if (kexec_on_panic) { - /* Enable special crash kernel control page alloc policy. */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - } - - ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, - cmdline_ptr, cmdline_len, flags); - if (ret) - goto out_free_image; - - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_post_load_bufs; - - ret = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - pr_err("Could not allocate control_code_buffer\n"); - goto out_free_post_load_bufs; - } - - if (!kexec_on_panic) { - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free_control_pages; - } - } - - *rimage = image; - return 0; -out_free_control_pages: - kimage_free_page_list(&image->control_pages); -out_free_post_load_bufs: - kimage_file_post_load_cleanup(image); -out_free_image: - kfree(image); - return ret; -} -#else /* CONFIG_KEXEC_FILE */ -static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -#endif /* CONFIG_KEXEC_FILE */ - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, - unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) - return 1; - } - - return 0; -} - -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *pages; - - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - set_page_private(pages, order); - count = 1 << order; - for (i = 0; i < count; i++) - SetPageReserved(pages + i); - } - - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - - order = page_private(page); - count = 1 << order; - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); -} - -static void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - kimage_free_pages(page); - } -} - -static struct page *kimage_alloc_normal_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. - */ - do { - unsigned long pfn, epfn, addr, eaddr; - - pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while (!pages); - - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everything to image->dest_pages. - * - * For now it is simpler to just free the pages. - */ - kimage_free_page_list(&extra_pages); - - return pages; -} - -static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * Control pages are also the only pags we must allocate - * when loading a crash kernel. All of the other pages - * are specified by the segments and we just memcpy - * into them directly. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * Given the low demand this implements a very simple - * allocator that finds the first hole of the appropriate - * size in the reserved memory region, and allocates all - * of the memory up to and including the hole. - */ - unsigned long hole_start, hole_end, size; - struct page *pages; - - pages = NULL; - size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { - unsigned long i; - - if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) - break; - /* See if I overlap any of the segments */ - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - if ((hole_end >= mstart) && (hole_start <= mend)) { - /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - break; - } - } - /* If I don't overlap any segments I have found my hole! */ - if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); - break; - } - } - if (pages) - image->control_page = hole_end; - - return pages; -} - - -struct page *kimage_alloc_control_pages(struct kimage *image, - unsigned int order) -{ - struct page *pages = NULL; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - pages = kimage_alloc_normal_control_pages(image, order); - break; - case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); - break; - } - - return pages; -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) - image->entry++; - - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) - return -ENOMEM; - - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = ind_page + - ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - - return 0; -} - -static int kimage_set_destination(struct kimage *image, - unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unusable_pages); - -} -static void kimage_terminate(struct kimage *image) -{ - if (*image->entry != 0) - image->entry++; - - *image->entry = IND_DONE; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -static void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } else if (entry & IND_SOURCE) - kimage_free_entry(entry); - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - - /* - * Free up any temporary buffers allocated. This might hit if - * error occurred much later after buffer allocation. - */ - if (image->file_mode) - kimage_file_post_load_cleanup(image); - - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, - unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) - destination = entry & PAGE_MASK; - else if (entry & IND_SOURCE) { - if (page == destination) - return ptr; - destination += PAGE_SIZE; - } - } - - return NULL; -} - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. - * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. - * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) - return NULL; - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > - (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unusable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. - */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it if it's - * gfp_flags honor the ones passed in. - */ - if (!(gfp_mask & __GFP_HIGHMEM) && - PageHighMem(old_page)) { - kimage_free_pages(old_page); - continue; - } - addr = old_addr; - page = old_page; - break; - } else { - /* Place the page on the destination list I - * will use it later. - */ - list_add(&page->lru, &image->dest_pages); - } - } - - return page; -} - -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - - result = kimage_set_destination(image, maddr); - if (result < 0) - goto out; - - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (!page) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) - << PAGE_SHIFT); - if (result < 0) - goto out; - - ptr = kmap(page); - /* Start with a clear page */ - clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) -{ - /* For crash dumps kernels we simply copy the data from - * user space to it's destination. - * We do things a page at a time for the sake of kmap. - */ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = pfn_to_page(maddr >> PAGE_SHIFT); - if (!page) { - result = -ENOMEM; - goto out; - } - ptr = kmap(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - if (mchunk > uchunk) { - /* Zero the trailing part of the page */ - memset(ptr + uchunk, 0, mchunk - uchunk); - } - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kexec_flush_icache_page(page); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - int result = -ENOMEM; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); - break; - case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); - break; - } - - return result; -} - /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -1230,11 +123,6 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image; -struct kimage *kexec_crash_image; -int kexec_load_disabled; - -static DEFINE_MUTEX(kexec_mutex); SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) @@ -1329,18 +217,6 @@ out: return result; } -/* - * Add and remove page tables for crashkernel memory - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, nr_segments, @@ -1379,1391 +255,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return sys_kexec_load(entry, nr_segments, ksegments, flags); } #endif - -#ifdef CONFIG_KEXEC_FILE -SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, - unsigned long, cmdline_len, const char __user *, cmdline_ptr, - unsigned long, flags) -{ - int ret = 0, i; - struct kimage **dest_image, *image; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) - return -EPERM; - - /* Make sure we have a legal set of flags */ - if (flags != (flags & KEXEC_FILE_FLAGS)) - return -EINVAL; - - image = NULL; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - - dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) - dest_image = &kexec_crash_image; - - if (flags & KEXEC_FILE_UNLOAD) - goto exchange; - - /* - * In case of crash, new kernel gets loaded in reserved region. It is - * same memory where old crash kernel might be loaded. Free any - * current crash dump kernel before we corrupt it. - */ - if (flags & KEXEC_FILE_ON_CRASH) - kimage_free(xchg(&kexec_crash_image, NULL)); - - ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, - cmdline_len, flags); - if (ret) - goto out; - - ret = machine_kexec_prepare(image); - if (ret) - goto out; - - ret = kexec_calculate_store_digests(image); - if (ret) - goto out; - - for (i = 0; i < image->nr_segments; i++) { - struct kexec_segment *ksegment; - - ksegment = &image->segment[i]; - pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", - i, ksegment->buf, ksegment->bufsz, ksegment->mem, - ksegment->memsz); - - ret = kimage_load_segment(image, &image->segment[i]); - if (ret) - goto out; - } - - kimage_terminate(image); - - /* - * Free up any temporary buffers allocated which are not needed - * after image has been loaded - */ - kimage_file_post_load_cleanup(image); -exchange: - image = xchg(dest_image, image); -out: - mutex_unlock(&kexec_mutex); - kimage_free(image); - return ret; -} - -#endif /* CONFIG_KEXEC_FILE */ - -void crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_mutex here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (mutex_trylock(&kexec_mutex)) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - mutex_unlock(&kexec_mutex); - } -} - -size_t crash_get_memory_size(void) -{ - size_t size = 0; - mutex_lock(&kexec_mutex); - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); - return size; -} - -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long start, end; - unsigned long old_size; - struct resource *ram_res; - - mutex_lock(&kexec_mutex); - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - - crash_map_reserved_pages(); - crash_free_reserved_phys_range(end, crashk_res.end); - - if ((start == end) && (crashk_res.parent != NULL)) - release_resource(&crashk_res); - - ram_res->start = end; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; - ram_res->name = "System RAM"; - - crashk_res.end = end - 1; - - insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); - -unlock: - mutex_unlock(&kexec_mutex); - return ret; -} - -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. */ - crash_notes = alloc_percpu(note_buf_t); - if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); - return -ENOMEM; - } - return 0; -} -subsys_initcall(crash_notes_memory_init); - - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -unsigned long __weak paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - -#ifdef CONFIG_KEXEC_FILE -static int locate_mem_hole_top_down(unsigned long start, unsigned long end, - struct kexec_buf *kbuf) -{ - struct kimage *image = kbuf->image; - unsigned long temp_start, temp_end; - - temp_end = min(end, kbuf->buf_max); - temp_start = temp_end - kbuf->memsz; - - do { - /* align down start */ - temp_start = temp_start & (~(kbuf->buf_align - 1)); - - if (temp_start < start || temp_start < kbuf->buf_min) - return 0; - - temp_end = temp_start + kbuf->memsz - 1; - - /* - * Make sure this does not conflict with any of existing - * segments - */ - if (kimage_is_destination_range(image, temp_start, temp_end)) { - temp_start = temp_start - PAGE_SIZE; - continue; - } - - /* We found a suitable memory range */ - break; - } while (1); - - /* If we are here, we found a suitable memory range */ - kbuf->mem = temp_start; - - /* Success, stop navigating through remaining System RAM ranges */ - return 1; -} - -static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, - struct kexec_buf *kbuf) -{ - struct kimage *image = kbuf->image; - unsigned long temp_start, temp_end; - - temp_start = max(start, kbuf->buf_min); - - do { - temp_start = ALIGN(temp_start, kbuf->buf_align); - temp_end = temp_start + kbuf->memsz - 1; - - if (temp_end > end || temp_end > kbuf->buf_max) - return 0; - /* - * Make sure this does not conflict with any of existing - * segments - */ - if (kimage_is_destination_range(image, temp_start, temp_end)) { - temp_start = temp_start + PAGE_SIZE; - continue; - } - - /* We found a suitable memory range */ - break; - } while (1); - - /* If we are here, we found a suitable memory range */ - kbuf->mem = temp_start; - - /* Success, stop navigating through remaining System RAM ranges */ - return 1; -} - -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) -{ - struct kexec_buf *kbuf = (struct kexec_buf *)arg; - unsigned long sz = end - start + 1; - - /* Returning 0 will take to next memory range */ - if (sz < kbuf->memsz) - return 0; - - if (end < kbuf->buf_min || start > kbuf->buf_max) - return 0; - - /* - * Allocate memory top down with-in ram range. Otherwise bottom up - * allocation. - */ - if (kbuf->top_down) - return locate_mem_hole_top_down(start, end, kbuf); - return locate_mem_hole_bottom_up(start, end, kbuf); -} - -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. - */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) -{ - - struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; - int ret; - - /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) - return -EINVAL; - - if (image->nr_segments >= KEXEC_SEGMENT_MAX) - return -EINVAL; - - /* - * Make sure we are not trying to add buffer after allocating - * control pages. All segments need to be placed first before - * any control pages are allocated. As control page allocation - * logic goes through list of segments to make sure there are - * no destination overlaps. - */ - if (!list_empty(&image->control_pages)) { - WARN_ON(1); - return -EINVAL; - } - - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; - - /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } - - /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; - ksegment->kbuf = kbuf->buffer; - ksegment->bufsz = kbuf->bufsz; - ksegment->mem = kbuf->mem; - ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; - return 0; -} - -/* Calculate and store the digest of segments */ -static int kexec_calculate_store_digests(struct kimage *image) -{ - struct crypto_shash *tfm; - struct shash_desc *desc; - int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; - void *zero_buf; - struct kexec_sha_region *sha_regions; - struct purgatory_info *pi = &image->purgatory_info; - - zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); - zero_buf_sz = PAGE_SIZE; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); - sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) - goto out_free_desc; - - desc->tfm = tfm; - desc->flags = 0; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; - - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } - - for (j = i = 0; i < image->nr_segments; i++) { - struct kexec_segment *ksegment; - - ksegment = &image->segment[i]; - /* - * Skip purgatory as it will be modified once we put digest - * info in purgatory. - */ - if (ksegment->kbuf == pi->purgatory_buf) - continue; - - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; - - /* - * Assume rest of the buffer is filled with zero and - * update digest accordingly. - */ - nullsz = ksegment->memsz - ksegment->bufsz; - while (nullsz) { - unsigned long bytes = nullsz; - - if (bytes > zero_buf_sz) - bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; - nullsz -= bytes; - } - - if (ret) - break; - - sha_regions[j].start = ksegment->mem; - sha_regions[j].len = ksegment->memsz; - j++; - } - - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; - - ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } - -out_free_digest: - kfree(digest); -out_free_sha_regions: - vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: - return ret; -} - -/* Actually load purgatory. Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) -{ - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; - - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; - - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - if (!sechdrs) - return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - - /* - * Identify entry point section and make entry relative to section - * start. - */ - entry = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - entry -= sechdrs[i].sh_addr; - break; - } - } - - /* Determine how much memory is needed to load relocatable object. */ - buf_align = 1; - bss_align = 1; - buf_sz = 0; - bss_sz = 0; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; - } else { - /* bss section */ - if (bss_align < align) - bss_align = align; - bss_sz = ALIGN(bss_sz, align); - bss_sz += sechdrs[i].sh_size; - } - } - - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); - - memsz = buf_sz + bss_pad + bss_sz; - - /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { - ret = -ENOMEM; - goto out; - } - - if (buf_align < bss_align) - buf_align = bss_align; - - /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); - if (ret) - goto out; - - /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { - bss_addr = ALIGN(bss_addr, align); - sechdrs[i].sh_addr = bss_addr; - bss_addr += sechdrs[i].sh_size; - } - } - - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; - - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; - - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; - - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. - */ - pi->purgatory_buf = purgatory_buf; - return ret; -out: - vfree(sechdrs); - vfree(purgatory_buf); - return ret; -} - -static int kexec_apply_relocations(struct kimage *image) -{ - int i, ret; - struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; - - /* Apply relocations */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; - - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) - continue; - - /* - * For section of type SHT_RELA/SHT_REL, - * ->sh_link contains section header index of associated - * symbol table. And ->sh_info contains section header - * index of section to which relocations apply. - */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) - return -ENOEXEC; - - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; - - if (!(section->sh_flags & SHF_ALLOC)) - continue; - - /* - * symtab->sh_link contain section header index of associated - * string table. - */ - if (symtab->sh_link >= pi->ehdr->e_shnum) - /* Invalid section number? */ - continue; - - /* - * Respective architecture needs to provide support for applying - * relocations of type SHT_RELA/SHT_REL. - */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); - if (ret) - return ret; - } - - return 0; -} - -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) -{ - struct purgatory_info *pi = &image->purgatory_info; - int ret; - - if (kexec_purgatory_size <= 0) - return -EINVAL; - - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; - - ret = __kexec_load_purgatory(image, min, max, top_down); - if (ret) - return ret; - - ret = kexec_apply_relocations(image); - if (ret) - goto out; - - *load_addr = pi->purgatory_load_addr; - return 0; -out: - vfree(pi->sechdrs); - vfree(pi->purgatory_buf); - return ret; -} - -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) -{ - Elf_Sym *syms; - Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; - int i, k; - const char *strtab; - - if (!pi->sechdrs || !pi->ehdr) - return NULL; - - sechdrs = pi->sechdrs; - ehdr = pi->ehdr; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; -} - -void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) -{ - struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; - Elf_Shdr *sechdr; - - sym = kexec_purgatory_find_symbol(pi, name); - if (!sym) - return ERR_PTR(-EINVAL); - - sechdr = &pi->sechdrs[sym->st_shndx]; - - /* - * Returns the address where symbol will finally be loaded after - * kexec_load_segment() - */ - return (void *)(sechdr->sh_addr + sym->st_value); -} - -/* - * Get or set value of a symbol. If "get_value" is true, symbol value is - * returned in buf otherwise symbol value is set based on value in buf. - */ -int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, - void *buf, unsigned int size, bool get_value) -{ - Elf_Sym *sym; - Elf_Shdr *sechdrs; - struct purgatory_info *pi = &image->purgatory_info; - char *sym_buf; - - sym = kexec_purgatory_find_symbol(pi, name); - if (!sym) - return -EINVAL; - - if (sym->st_size != size) { - pr_err("symbol %s size mismatch: expected %lu actual %u\n", - name, (unsigned long)sym->st_size, size); - return -EINVAL; - } - - sechdrs = pi->sechdrs; - - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - pr_err("symbol %s is in a bss section. Cannot %s\n", name, - get_value ? "get" : "set"); - return -EINVAL; - } - - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; - - if (get_value) - memcpy((void *)buf, sym_buf, size); - else - memcpy((void *)sym_buf, buf, size); - - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - -/* - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -int kernel_kexec(void) -{ - int error = 0; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - if (!kexec_image) { - error = -EINVAL; - goto Unlock; - } - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - lock_system_sleep(); - pm_prepare_console(); - error = freeze_processes(); - if (error) { - error = -EBUSY; - goto Restore_console; - } - suspend_console(); - error = dpm_suspend_start(PMSG_FREEZE); - if (error) - goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_end(). We *must* call - * dpm_suspend_end() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. - */ - error = dpm_suspend_end(PMSG_FREEZE); - if (error) - goto Resume_devices; - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - local_irq_disable(); - error = syscore_suspend(); - if (error) - goto Enable_irqs; - } else -#endif - { - kexec_in_progress = true; - kernel_restart_prepare(NULL); - migrate_to_reboot_cpu(); - - /* - * migrate_to_reboot_cpu() disables CPU hotplug assuming that - * no further code needs to use CPU hotplug (which is true in - * the reboot case). However, the kexec path depends on using - * CPU hotplug again; so re-enable it here. - */ - cpu_hotplug_enable(); - pr_emerg("Starting new kernel\n"); - machine_shutdown(); - } - - machine_kexec(kexec_image); - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - syscore_resume(); - Enable_irqs: - local_irq_enable(); - Enable_cpus: - enable_nonboot_cpus(); - dpm_resume_start(PMSG_RESTORE); - Resume_devices: - dpm_resume_end(PMSG_RESTORE); - Resume_console: - resume_console(); - thaw_processes(); - Restore_console: - pm_restore_console(); - unlock_system_sleep(); - } -#endif - - Unlock: - mutex_unlock(&kexec_mutex); - return error; -} diff --git a/kernel/kernel/kexec_core.c b/kernel/kernel/kexec_core.c new file mode 100644 index 000000000..11b64a63c --- /dev/null +++ b/kernel/kernel/kexec_core.c @@ -0,0 +1,1534 @@ +/* + * kexec.c - kexec system call core code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/kexec.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/ioport.h> +#include <linux/hardirq.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/utsname.h> +#include <linux/numa.h> +#include <linux/suspend.h> +#include <linux/device.h> +#include <linux/freezer.h> +#include <linux/pm.h> +#include <linux/cpu.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/console.h> +#include <linux/vmalloc.h> +#include <linux/swap.h> +#include <linux/syscore_ops.h> +#include <linux/compiler.h> +#include <linux/hugetlb.h> + +#include <asm/page.h> +#include <asm/sections.h> + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include "kexec_internal.h" + +DEFINE_MUTEX(kexec_mutex); + +/* Per cpu memory for storing cpu states in case of system crash. */ +note_buf_t __percpu *crash_notes; + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in do_exit() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in <asm/kexec.h>. + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long dest); + +int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + * + * Since the kernel does everything in page size chunks ensure + * the destination addresses are page aligned. Too many + * special cases crop of when we don't do this. The most + * insidious is getting overlapping destination addresses + * simply because addresses are changed to page size + * granularity. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) + return result; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + return result; + } + + /* Verify our destination addresses do not overlap. + * If we alloed overlapping destination addresses + * through very weird things can happen with no + * easy explanation as one segment stops on another. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + unsigned long j; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + for (j = 0; j < i; j++) { + unsigned long pstart, pend; + + pstart = image->segment[j].mem; + pend = pstart + image->segment[j].memsz; + /* Do the segments overlap ? */ + if ((mend > pstart) && (mstart < pend)) + return result; + } + } + + /* Ensure our buffer sizes are strictly less than + * our memory sizes. This should always be the case, + * and it is easier to check up front than to be surprised + * later on. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + if (image->segment[i].bufsz > image->segment[i].memsz) + return result; + } + + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ + + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; +} + +int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) + return 1; + } + + return 0; +} + +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *pages; + + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + + pages->mapping = NULL; + set_page_private(pages, order); + count = 1 << order; + for (i = 0; i < count; i++) + SetPageReserved(pages + i); + } + + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + + order = page_private(page); + count = 1 << order; + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + kimage_free_pages(page); + } +} + +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + + pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while (!pages); + + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everything to image->dest_pages. + * + * For now it is simpler to just free the pages. + */ + kimage_free_page_list(&extra_pages); + + return pages; +} + +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * Control pages are also the only pags we must allocate + * when loading a crash kernel. All of the other pages + * are specified by the segments and we just memcpy + * into them directly. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * Given the low demand this implements a very simple + * allocator that finds the first hole of the appropriate + * size in the reserved memory region, and allocates all + * of the memory up to and including the hole. + */ + unsigned long hole_start, hole_end, size; + struct page *pages; + + pages = NULL; + size = (1 << order) << PAGE_SHIFT; + hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + while (hole_end <= crashk_res.end) { + unsigned long i; + + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) + break; + /* See if I overlap any of the segments */ + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + if ((hole_end >= mstart) && (hole_start <= mend)) { + /* Advance the hole to the end of the segment */ + hole_start = (mend + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + break; + } + } + /* If I don't overlap any segments I have found my hole! */ + if (i == image->nr_segments) { + pages = pfn_to_page(hole_start >> PAGE_SHIFT); + image->control_page = hole_end; + break; + } + } + + return pages; +} + + +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + struct page *pages = NULL; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + pages = kimage_alloc_normal_control_pages(image, order); + break; + case KEXEC_TYPE_CRASH: + pages = kimage_alloc_crash_control_pages(image, order); + break; + } + + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) + image->entry++; + + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) + return -ENOMEM; + + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + + return 0; +} + +static int kimage_set_destination(struct kimage *image, + unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unusable pages I have cached */ + kimage_free_page_list(&image->unusable_pages); + +} +void kimage_terminate(struct kimage *image) +{ + if (*image->entry != 0) + image->entry++; + + *image->entry = IND_DONE; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } else if (entry & IND_SOURCE) + kimage_free_entry(entry); + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) + destination = entry & PAGE_MASK; + else if (entry & IND_SOURCE) { + if (page == destination) + return ptr; + destination += PAGE_SIZE; + } + } + + return NULL; +} + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) + return NULL; + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unusable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it if it's + * gfp_flags honor the ones passed in. + */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } + addr = old_addr; + page = old_page; + break; + } + /* Place the page on the destination list, to be used later */ + list_add(&page->lru, &image->dest_pages); + } + + return page; +} + +static int kimage_load_normal_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + result = kimage_set_destination(image, maddr); + if (result < 0) + goto out; + + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); + if (!page) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) + goto out; + + ptr = kmap(page); + /* Start with a clear page */ + clear_page(ptr); + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_crash_segment(struct kimage *image, + struct kexec_segment *segment) +{ + /* For crash dumps kernels we simply copy the data from + * user space to it's destination. + * We do things a page at a time for the sake of kmap. + */ + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = pfn_to_page(maddr >> PAGE_SHIFT); + if (!page) { + result = -ENOMEM; + goto out; + } + ptr = kmap(page); + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + int result = -ENOMEM; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + result = kimage_load_normal_segment(image, segment); + break; + case KEXEC_TYPE_CRASH: + result = kimage_load_crash_segment(image, segment); + break; + } + + return result; +} + +struct kimage *kexec_image; +struct kimage *kexec_crash_image; +int kexec_load_disabled; + +void crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_mutex here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (mutex_trylock(&kexec_mutex)) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + mutex_unlock(&kexec_mutex); + } +} + +size_t crash_get_memory_size(void) +{ + size_t size = 0; + + mutex_lock(&kexec_mutex); + if (crashk_res.end != crashk_res.start) + size = resource_size(&crashk_res); + mutex_unlock(&kexec_mutex); + return size; +} + +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); + end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); + + crash_map_reserved_pages(); + crash_free_reserved_phys_range(end, crashk_res.end); + + if ((start == end) && (crashk_res.parent != NULL)) + release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); + crash_unmap_reserved_pages(); + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. */ + size_t size, align; + + /* + * crash_notes could be allocated across 2 vmalloc pages when percpu + * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc + * pages are also on 2 continuous physical pages. In this case the + * 2nd part of crash_notes in 2nd page could be lost since only the + * starting address and size of crash_notes are exported through sysfs. + * Here round up the size of crash_notes to the nearest power of two + * and pass it to __alloc_percpu as align value. This can make sure + * crash_notes is allocated inside one physical page. + */ + size = sizeof(note_buf_t); + align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); + + /* + * Break compile if size is bigger than PAGE_SIZE since crash_notes + * definitely will be in 2 pages with that. + */ + BUILD_BUG_ON(size > PAGE_SIZE); + + crash_notes = __alloc_percpu(size, align); + if (!crash_notes) { + pr_warn("Memory allocation for saving cpu register states failed\n"); + return -ENOMEM; + } + return 0; +} +subsys_initcall(crash_notes_memory_init); + + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +unsigned long __weak paddr_vmcoreinfo_note(void) +{ + return __pa((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_kexec_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_X86 + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); +#endif +#ifdef CONFIG_HUGETLBFS + VMCOREINFO_SYMBOL(free_huge_page); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); + +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + lock_system_sleep(); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = dpm_suspend_start(PMSG_FREEZE); + if (error) + goto Resume_console; + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto Resume_devices; + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + local_irq_disable(); + error = syscore_suspend(); + if (error) + goto Enable_irqs; + } else +#endif + { + kexec_in_progress = true; + kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); + pr_emerg("Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + syscore_resume(); + Enable_irqs: + local_irq_enable(); + Enable_cpus: + enable_nonboot_cpus(); + dpm_resume_start(PMSG_RESTORE); + Resume_devices: + dpm_resume_end(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + unlock_system_sleep(); + } +#endif + + Unlock: + mutex_unlock(&kexec_mutex); + return error; +} + +/* + * Add and remove page tables for crashkernel memory + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +void __weak crash_map_reserved_pages(void) +{} + +void __weak crash_unmap_reserved_pages(void) +{} diff --git a/kernel/kernel/kexec_file.c b/kernel/kernel/kexec_file.c new file mode 100644 index 000000000..b70ada002 --- /dev/null +++ b/kernel/kernel/kexec_file.c @@ -0,0 +1,1047 @@ +/* + * kexec: kexec_file_load system call + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <linux/syscalls.h> +#include <linux/vmalloc.h> +#include "kexec_internal.h" + +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + return -EINVAL; +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and command line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + + /* + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. + */ + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: + kfree(image); + return ret; +} + +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = kbuf->buffer; + ksegment->bufsz = kbuf->bufsz; + ksegment->mem = kbuf->mem; + ksegment->memsz = kbuf->memsz; + image->nr_segments++; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective architecture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} diff --git a/kernel/kernel/kexec_internal.h b/kernel/kernel/kexec_internal.h new file mode 100644 index 000000000..e4392a698 --- /dev/null +++ b/kernel/kernel/kexec_internal.h @@ -0,0 +1,22 @@ +#ifndef LINUX_KEXEC_INTERNAL_H +#define LINUX_KEXEC_INTERNAL_H + +#include <linux/kexec.h> + +struct kimage *do_kimage_alloc_init(void); +int sanity_check_segment_list(struct kimage *image); +void kimage_free_page_list(struct list_head *list); +void kimage_free(struct kimage *image); +int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +void kimage_terminate(struct kimage *image); +int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); + +extern struct mutex kexec_mutex; + +#ifdef CONFIG_KEXEC_FILE +void kimage_file_post_load_cleanup(struct kimage *image); +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ +#endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/kernel/kmod.c b/kernel/kernel/kmod.c index 2777f40a9..0277d1216 100644 --- a/kernel/kernel/kmod.c +++ b/kernel/kernel/kmod.c @@ -45,8 +45,6 @@ extern int max_threads; -static struct workqueue_struct *khelper_wq; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -114,10 +112,11 @@ out: * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns - * zero on success or a negative errno code on failure. Note that a - * successful module load does not mean the module did not then unload - * and exit on an error of its own. Callers must check that the service - * they requested is now available not blindly invoke it. + * zero on success or a negative errno code or positive exit code from + * "modprobe" on failure. Note that a successful module load does not mean + * the module did not then unload and exit on an error of its own. Callers + * must check that the service they requested is now available not blindly + * invoke it. * * If module auto-loading support is disabled then this function * becomes a no-operation. @@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info) /* * This is the task which runs the usermode application */ -static int ____call_usermodehelper(void *data) +static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; @@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock); - /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed_ptr(current, cpu_all_mask); - /* - * Our parent is keventd, which runs with elevated scheduling priority. - * Avoid propagating that into the userspace child. + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); @@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data) (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; - /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UHM_WAIT_PROC. + */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) @@ -266,15 +265,14 @@ out: do_exit(0); } -/* Keventd can't block, but this (a child) can. */ -static int wait_for_helper(void *data) +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { - struct subprocess_info *sub_info = data; pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; } else { @@ -282,44 +280,64 @@ static int wait_for_helper(void *data) /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. - * But wait_for_helper() always runs as keventd, and put_user() - * to a kernel address works OK for kernel threads, due to their - * having an mm_segment_t which spans the entire address space. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. * * Thus the __user pointer cast is valid here. */ sys_wait4(pid, (int __user *)&ret, 0, NULL); /* - * If ret is 0, either ____call_usermodehelper failed and the - * real error code is already in sub_info->retval or + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or * sub_info->retval is 0 anyway, so don't mess with it then. */ if (ret) sub_info->retval = ret; } + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + umh_complete(sub_info); - do_exit(0); } -/* This is run by khelper thread */ -static void __call_usermodehelper(struct work_struct *work) +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - pid_t pid; - if (sub_info->wait & UMH_WAIT_PROC) - pid = kernel_thread(wait_for_helper, sub_info, - CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, - SIGCHLD); - - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } } } @@ -509,7 +527,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, if (!sub_info) goto out; - INIT_WORK(&sub_info->work, __call_usermodehelper); + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; @@ -531,8 +549,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * from interrupt context. * * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of keventd. - * (ie. it runs with full root capabilities). + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { @@ -544,7 +562,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) return -EINVAL; } helper_lock(); - if (!khelper_wq || usermodehelper_disabled) { + if (usermodehelper_disabled) { retval = -EBUSY; goto out; } @@ -556,7 +574,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; - queue_work(khelper_wq, &sub_info->work); + queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; @@ -686,9 +704,3 @@ struct ctl_table usermodehelper_table[] = { }, { } }; - -void __init usermodehelper_init(void) -{ - khelper_wq = create_singlethread_workqueue("khelper"); - BUG_ON(!khelper_wq); -} diff --git a/kernel/kernel/kprobes.c b/kernel/kernel/kprobes.c index c90e417bb..d10ab6b9b 100644 --- a/kernel/kernel/kprobes.c +++ b/kernel/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; diff --git a/kernel/kernel/ksysfs.c b/kernel/kernel/ksysfs.c index d6fc8eeaa..c0e08d1cf 100644 --- a/kernel/kernel/ksysfs.c +++ b/kernel/kernel/ksysfs.c @@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_KEXEC_CORE */ #if defined(CONFIG_PREEMPT_RT_FULL) static ssize_t realtime_show(struct kobject *kobj, @@ -205,7 +205,7 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, diff --git a/kernel/kernel/kthread.c b/kernel/kernel/kthread.c index 10e489c44..9ff173dca 100644 --- a/kernel/kernel/kthread.c +++ b/kernel/kernel/kthread.c @@ -97,6 +97,7 @@ bool kthread_should_park(void) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); } +EXPORT_SYMBOL_GPL(kthread_should_park); /** * kthread_freezable_should_stop - should this freezable kthread return now? @@ -171,6 +172,7 @@ void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } +EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { @@ -246,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. - * @node: memory node number. + * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which no one will call kthread_stop(), or @@ -325,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) { - /* Must have done schedule() in kthread() before we set_task_cpu */ + unsigned long flags; + if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +{ + __kthread_bind_mask(p, cpumask_of(cpu), state); +} + +void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) +{ + __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** @@ -411,6 +428,7 @@ void kthread_unpark(struct task_struct *k) if (kthread) __kthread_unpark(k, kthread); } +EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). @@ -441,6 +459,7 @@ int kthread_park(struct task_struct *k) } return ret; } +EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). diff --git a/kernel/kernel/livepatch/core.c b/kernel/kernel/livepatch/core.c index 9ec555732..db545cbcd 100644 --- a/kernel/kernel/livepatch/core.c +++ b/kernel/kernel/livepatch/core.c @@ -128,7 +128,7 @@ static bool klp_is_patch_registered(struct klp_patch *patch) static bool klp_initialized(void) { - return klp_root_kobj; + return !!klp_root_kobj; } struct klp_find_arg { @@ -242,8 +242,9 @@ static int klp_find_verify_func_addr(struct klp_object *obj, int ret; #if defined(CONFIG_RANDOMIZE_BASE) - /* KASLR is enabled, disregard old_addr from user */ - func->old_addr = 0; + /* If KASLR has been enabled, adjust old_addr accordingly */ + if (kaslr_enabled() && func->old_addr) + func->old_addr += kaslr_offset(); #endif if (!func->old_addr || klp_is_module(obj)) @@ -293,6 +294,12 @@ static int klp_write_object_relocations(struct module *pmod, for (reloc = obj->relocs; reloc->name; reloc++) { if (!klp_is_module(obj)) { + +#if defined(CONFIG_RANDOMIZE_BASE) + /* If KASLR has been enabled, adjust old value accordingly */ + if (kaslr_enabled()) + reloc->val += kaslr_offset(); +#endif ret = klp_verify_vmlinux_symbol(reloc->name, reloc->val); if (ret) @@ -347,8 +354,10 @@ static void klp_disable_func(struct klp_func *func) { struct klp_ops *ops; - WARN_ON(func->state != KLP_ENABLED); - WARN_ON(!func->old_addr); + if (WARN_ON(func->state != KLP_ENABLED)) + return; + if (WARN_ON(!func->old_addr)) + return; ops = klp_find_ops(func->old_addr); if (WARN_ON(!ops)) @@ -430,7 +439,7 @@ static void klp_disable_object(struct klp_object *obj) { struct klp_func *func; - for (func = obj->funcs; func->old_name; func++) + klp_for_each_func(obj, func) if (func->state == KLP_ENABLED) klp_disable_func(func); @@ -448,7 +457,7 @@ static int klp_enable_object(struct klp_object *obj) if (WARN_ON(!klp_is_object_loaded(obj))) return -EINVAL; - for (func = obj->funcs; func->old_name; func++) { + klp_for_each_func(obj, func) { ret = klp_enable_func(func); if (ret) { klp_disable_object(obj); @@ -471,7 +480,7 @@ static int __klp_disable_patch(struct klp_patch *patch) pr_notice("disabling patch '%s'\n", patch->mod->name); - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { if (obj->state == KLP_ENABLED) klp_disable_object(obj); } @@ -531,7 +540,7 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { if (!klp_is_object_loaded(obj)) continue; @@ -659,6 +668,15 @@ static struct kobj_type klp_ktype_patch = { .default_attrs = klp_patch_attrs, }; +static void klp_kobj_release_object(struct kobject *kobj) +{ +} + +static struct kobj_type klp_ktype_object = { + .release = klp_kobj_release_object, + .sysfs_ops = &kobj_sysfs_ops, +}; + static void klp_kobj_release_func(struct kobject *kobj) { } @@ -688,7 +706,7 @@ static void klp_free_object_loaded(struct klp_object *obj) obj->mod = NULL; - for (func = obj->funcs; func->old_name; func++) + klp_for_each_func(obj, func) func->old_addr = 0; } @@ -703,7 +721,7 @@ static void klp_free_objects_limited(struct klp_patch *patch, for (obj = patch->objs; obj->funcs && obj != limit; obj++) { klp_free_funcs_limited(obj, NULL); - kobject_put(obj->kobj); + kobject_put(&obj->kobj); } } @@ -721,7 +739,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->state = KLP_DISABLED; return kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, "%s", func->old_name); + &obj->kobj, "%s", func->old_name); } /* parts of the initialization that is done only when the object is loaded */ @@ -737,7 +755,7 @@ static int klp_init_object_loaded(struct klp_patch *patch, return ret; } - for (func = obj->funcs; func->old_name; func++) { + klp_for_each_func(obj, func) { ret = klp_find_verify_func_addr(obj, func); if (ret) return ret; @@ -761,11 +779,12 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? obj->name : "vmlinux"; - obj->kobj = kobject_create_and_add(name, &patch->kobj); - if (!obj->kobj) - return -ENOMEM; + ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, + &patch->kobj, "%s", name); + if (ret) + return ret; - for (func = obj->funcs; func->old_name; func++) { + klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); if (ret) goto free; @@ -781,7 +800,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) free: klp_free_funcs_limited(obj, func); - kobject_put(obj->kobj); + kobject_put(&obj->kobj); return ret; } @@ -802,7 +821,7 @@ static int klp_init_patch(struct klp_patch *patch) if (ret) goto unlock; - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); if (ret) goto free; @@ -891,7 +910,7 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static void klp_module_notify_coming(struct klp_patch *patch, +static int klp_module_notify_coming(struct klp_patch *patch, struct klp_object *obj) { struct module *pmod = patch->mod; @@ -899,22 +918,23 @@ static void klp_module_notify_coming(struct klp_patch *patch, int ret; ret = klp_init_object_loaded(patch, obj); - if (ret) - goto err; + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + pmod->name, mod->name, ret); + return ret; + } if (patch->state == KLP_DISABLED) - return; + return 0; pr_notice("applying patch '%s' to loading module '%s'\n", pmod->name, mod->name); ret = klp_enable_object(obj); - if (!ret) - return; - -err: - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); + if (ret) + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); + return ret; } static void klp_module_notify_going(struct klp_patch *patch, @@ -938,6 +958,7 @@ disabled: static int klp_module_notify(struct notifier_block *nb, unsigned long action, void *data) { + int ret; struct module *mod = data; struct klp_patch *patch; struct klp_object *obj; @@ -957,13 +978,18 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { - for (obj = patch->objs; obj->funcs; obj++) { + klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; if (action == MODULE_STATE_COMING) { obj->mod = mod; - klp_module_notify_coming(patch, obj); + ret = klp_module_notify_coming(patch, obj); + if (ret) { + obj->mod = NULL; + pr_warn("patch '%s' is in an inconsistent state!\n", + patch->mod->name); + } } else /* MODULE_STATE_GOING */ klp_module_notify_going(patch, obj); @@ -981,7 +1007,7 @@ static struct notifier_block klp_module_nb = { .priority = INT_MIN+1, /* called late but before ftrace notifier */ }; -static int klp_init(void) +static int __init klp_init(void) { int ret; diff --git a/kernel/kernel/locking/Makefile b/kernel/kernel/locking/Makefile index ab269cf04..447b03082 100644 --- a/kernel/kernel/locking/Makefile +++ b/kernel/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += semaphore.o +obj-y += semaphore.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -21,16 +21,15 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o ifneq ($(CONFIG_PREEMPT_RT_FULL),y) obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o endif -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o -obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o +obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/kernel/locking/lglock.c b/kernel/kernel/locking/lglock.c index 9397974b1..57e0ea72c 100644 --- a/kernel/kernel/locking/lglock.c +++ b/kernel/kernel/locking/lglock.c @@ -10,7 +10,7 @@ # define lg_do_unlock(l) arch_spin_unlock(l) #else # define lg_lock_ptr struct rt_mutex -# define lg_do_lock(l) __rt_spin_lock(l) +# define lg_do_lock(l) __rt_spin_lock__no_mg(l) # define lg_do_unlock(l) __rt_spin_unlock(l) #endif /* @@ -78,6 +78,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) } EXPORT_SYMBOL(lg_local_unlock_cpu); +void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) +{ + BUG_ON(cpu1 == cpu2); + + /* lock in cpu order, just like lg_global_lock */ + if (cpu2 < cpu1) + swap(cpu1, cpu2); + + preempt_disable_nort(); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); + lg_do_lock(per_cpu_ptr(lg->lock, cpu1)); + lg_do_lock(per_cpu_ptr(lg->lock, cpu2)); +} + +void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) +{ + lock_release(&lg->lock_dep_map, 1, _RET_IP_); + lg_do_unlock(per_cpu_ptr(lg->lock, cpu1)); + lg_do_unlock(per_cpu_ptr(lg->lock, cpu2)); + preempt_enable_nort(); +} + void lg_global_lock(struct lglock *lg) { int i; diff --git a/kernel/kernel/locking/lockdep.c b/kernel/kernel/locking/lockdep.c index 577f02617..e98ee958a 100644 --- a/kernel/kernel/locking/lockdep.c +++ b/kernel/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ @@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references) + int references, int pin_count) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->waittime_stamp = 0; hlock->holdtime_stamp = lockstat_clock(); #endif + hlock->pin_count = pin_count; if (check && !mark_irqflags(curr, hlock)) return 0; @@ -3260,26 +3261,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; } -/* - * Common debugging checks for both nested and non-nested unlock: - */ -static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (unlikely(!debug_locks)) - return 0; - /* - * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (curr->lockdep_depth <= 0) - return print_unlock_imbalance_bug(curr, lock, ip); - - return 1; -} - static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) { if (hlock->instance == lock) @@ -3362,7 +3343,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3376,31 +3357,35 @@ found_it: } /* - * Remove the lock to the list of currently held locks in a - * potentially non-nested (out of order) manner. This is a - * relatively rare operation, as all the unlock APIs default - * to nested mode (which uses lock_release()): + * Remove the lock to the list of currently held locks - this gets + * called on mutex_unlock()/spin_unlock*() (or on a failed + * mutex_lock_interruptible()). + * + * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -lock_release_non_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) +__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { + struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; unsigned int depth; int i; - /* - * Check whether the lock exists in the current stack - * of held locks: - */ + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; + if (DEBUG_LOCKS_WARN_ON(depth <= 0)) + return print_unlock_imbalance_bug(curr, lock, ip); + /* + * Check whether the lock exists in the current stack + * of held locks: + */ prev_hlock = NULL; for (i = depth-1; i >= 0; i--) { hlock = curr->held_locks + i; @@ -3419,6 +3404,8 @@ found_it: if (hlock->instance == lock) lock_release_holdtime(hlock); + WARN(hlock->pin_count, "releasing a pinned lock\n"); + if (hlock->references) { hlock->references--; if (hlock->references) { @@ -3446,7 +3433,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3456,91 +3443,66 @@ found_it: */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) return 0; + return 1; } -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. the current top of the lock-stack is unlocked) - */ -static int lock_release_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) +static int __lock_is_held(struct lockdep_map *lock) { - struct held_lock *hlock; - unsigned int depth; - - /* - * Pop off the top of the lock stack: - */ - depth = curr->lockdep_depth - 1; - hlock = curr->held_locks + depth; - - /* - * Is the unlock non-nested: - */ - if (hlock->instance != lock || hlock->references) - return lock_release_non_nested(curr, lock, ip); - curr->lockdep_depth--; - - /* - * No more locks, but somehow we've got hash left over, who left it? - */ - if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) - return 0; + struct task_struct *curr = current; + int i; - curr->curr_chain_key = hlock->prev_chain_key; + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; - lock_release_holdtime(hlock); + if (match_held_lock(hlock, lock)) + return 1; + } -#ifdef CONFIG_DEBUG_LOCKDEP - hlock->prev_chain_key = 0; - hlock->class_idx = 0; - hlock->acquire_ip = 0; - hlock->irq_context = 0; -#endif - return 1; + return 0; } -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. the current top of the lock-stack is unlocked) - */ -static void -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +static void __lock_pin_lock(struct lockdep_map *lock) { struct task_struct *curr = current; + int i; - if (!check_unlock(curr, lock, ip)) + if (unlikely(!debug_locks)) return; - if (nested) { - if (!lock_release_nested(curr, lock, ip)) - return; - } else { - if (!lock_release_non_nested(curr, lock, ip)) + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + hlock->pin_count++; return; + } } - check_chain_key(curr); + WARN(1, "pinning an unheld lock\n"); } -static int __lock_is_held(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock) { struct task_struct *curr = current; int i; + if (unlikely(!debug_locks)) + return; + for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) + return; + + hlock->pin_count--; + return; + } } - return 0; + WARN(1, "unpinning an unheld lock\n"); } /* @@ -3623,7 +3585,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -3641,7 +3603,8 @@ void lock_release(struct lockdep_map *lock, int nested, check_flags(flags); current->lockdep_recursion = 1; trace_lock_release(lock, ip); - __lock_release(lock, nested, ip); + if (__lock_release(lock, nested, ip)) + check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -3667,6 +3630,40 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); +void lock_pin_lock(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_pin_lock(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_pin_lock); + +void lock_unpin_lock(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_unpin_lock(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_unpin_lock); + void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { current->lockdep_reclaim_gfp = gfp_mask; @@ -4069,8 +4066,7 @@ void __init lockdep_info(void) #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { - printk("WARNING: lockdep init error! lock-%s was acquired" - "before lockdep_init\n", lock_init_error); + printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } diff --git a/kernel/kernel/locking/lockdep_proc.c b/kernel/kernel/locking/lockdep_proc.c index d83d798be..dbb61a302 100644 --- a/kernel/kernel/locking/lockdep_proc.c +++ b/kernel/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/kernel/locking/locktorture.c b/kernel/kernel/locking/locktorture.c index aa60d919e..291fc19e2 100644 --- a/kernel/kernel/locking/locktorture.c +++ b/kernel/kernel/locking/locktorture.c @@ -17,12 +17,14 @@ * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kthread.h> +#include <linux/sched/rt.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/rwsem.h> @@ -33,6 +35,7 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/percpu-rwsem.h> #include <linux/torture.h> MODULE_LICENSE("GPL"); @@ -90,11 +93,13 @@ struct lock_torture_ops { void (*init)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); + void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(void); - unsigned long flags; + + unsigned long flags; /* for irq spinlocks */ const char *name; }; @@ -121,12 +126,12 @@ static int torture_lock_busted_write_lock(void) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_us = 100; + const unsigned long longdelay_ms = 100; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_us))) - mdelay(longdelay_us); + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); #ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ @@ -138,9 +143,15 @@ static void torture_lock_busted_write_unlock(void) /* BUGGY, do not use in real life!!! */ } +static void torture_boost_dummy(struct torture_random_state *trsp) +{ + /* Only rtmutexes care about priority */ +} + static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -159,14 +170,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_us = 100; + const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_us))) - mdelay(longdelay_us); + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); @@ -184,6 +195,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -210,6 +222,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -274,6 +287,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -308,12 +322,13 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) static void torture_rwlock_read_unlock_irq(void) __releases(torture_rwlock) { - write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); + read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); } static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -353,6 +368,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -360,6 +376,90 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#ifdef CONFIG_RT_MUTEXES +static DEFINE_RT_MUTEX(torture_rtmutex); + +static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; +} + +static void torture_rtmutex_boost(struct torture_random_state *trsp) +{ + int policy; + struct sched_param param; + const unsigned int factor = 50000; /* yes, quite arbitrary */ + + if (!rt_task(current)) { + /* + * (1) Boost priority once every ~50k operations. When the + * task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + policy = SCHED_FIFO; + param.sched_priority = MAX_RT_PRIO - 1; + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another ~500k operations, + * then restored back to its original prio, and so forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + policy = SCHED_NORMAL; + param.sched_priority = 0; + } else /* common case, do nothing */ + return; + } + + sched_setscheduler_nocheck(current, policy, ¶m); +} + +static void torture_rtmutex_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* + * We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +{ + rt_mutex_unlock(&torture_rtmutex); +} + +static struct lock_torture_ops rtmutex_lock_ops = { + .writelock = torture_rtmutex_lock, + .write_delay = torture_rtmutex_delay, + .task_boost = torture_rtmutex_boost, + .writeunlock = torture_rtmutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "rtmutex_lock" +}; +#endif + static DECLARE_RWSEM(torture_rwsem); static int torture_rwsem_down_write(void) __acquires(torture_rwsem) { @@ -418,6 +518,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -425,6 +526,48 @@ static struct lock_torture_ops rwsem_lock_ops = { .name = "rwsem_lock" }; +#include <linux/percpu-rwsem.h> +static struct percpu_rw_semaphore pcpu_rwsem; + +void torture_percpu_rwsem_init(void) +{ + BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); +} + +static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +{ + percpu_down_write(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +{ + percpu_up_write(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +{ + percpu_down_read(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +{ + percpu_up_read(&pcpu_rwsem); +} + +static struct lock_torture_ops percpu_rwsem_lock_ops = { + .init = torture_percpu_rwsem_init, + .writelock = torture_percpu_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_percpu_rwsem_up_write, + .readlock = torture_percpu_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_percpu_rwsem_up_read, + .name = "percpu_rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. @@ -441,6 +584,7 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->task_boost(&rand); cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; @@ -455,6 +599,8 @@ static int lock_torture_writer(void *arg) stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); + + cxt.cur_ops->task_boost(NULL); /* reset prio */ torture_kthread_stopping("lock_torture_writer"); return 0; } @@ -641,7 +787,11 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, +#ifdef CONFIG_RT_MUTEXES + &rtmutex_lock_ops, +#endif &rwsem_lock_ops, + &percpu_rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -660,11 +810,11 @@ static int __init lock_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cxt.cur_ops->init) - cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cxt.cur_ops->init(); if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; @@ -675,6 +825,10 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "mutex", 5) == 0) cxt.debug_lock = true; #endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (strncmp(torture_type, "rtmutex", 7) == 0) + cxt.debug_lock = true; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if ((strncmp(torture_type, "spin", 4) == 0) || (strncmp(torture_type, "rw_lock", 7) == 0)) diff --git a/kernel/kernel/locking/mcs_spinlock.h b/kernel/kernel/locking/mcs_spinlock.h index 75e114bdf..5b9102a47 100644 --- a/kernel/kernel/locking/mcs_spinlock.h +++ b/kernel/kernel/locking/mcs_spinlock.h @@ -17,6 +17,7 @@ struct mcs_spinlock { struct mcs_spinlock *next; int locked; /* 1 if lock acquired */ + int count; /* nesting count, see qspinlock.c */ }; #ifndef arch_mcs_spin_lock_contended @@ -66,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg(lock, node); + prev = xchg_acquire(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads @@ -97,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) /* * Release the lock by setting it to NULL */ - if (likely(cmpxchg(lock, node, NULL) == node)) + if (likely(cmpxchg_release(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) diff --git a/kernel/kernel/locking/mutex.c b/kernel/kernel/locking/mutex.c index 4cccea6b8..0551c219c 100644 --- a/kernel/kernel/locking/mutex.c +++ b/kernel/kernel/locking/mutex.c @@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) static inline bool mutex_try_to_acquire(struct mutex *lock) { return !mutex_is_locked(lock) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1); + (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); } /* @@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * Once more, try to acquire the lock. Only try-lock the mutex if * it is unlocked to reduce unnecessary xchg() operations. */ - if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) + if (!mutex_is_locked(lock) && + (atomic_xchg_acquire(&lock->count, 0) == 1)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * non-negative in order to avoid unnecessary xchg operations: */ if (atomic_read(&lock->count) >= 0 && - (atomic_xchg(&lock->count, -1) == 1)) + (atomic_xchg_acquire(&lock->count, -1) == 1)) break; /* @@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) spin_lock_mutex(&lock->wait_lock, flags); - prev = atomic_xchg(&lock->count, -1); + prev = atomic_xchg_acquire(&lock->count, -1); if (likely(prev == 1)) { mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); diff --git a/kernel/kernel/locking/osq_lock.c b/kernel/kernel/locking/osq_lock.c index dc85ee23a..05a37857a 100644 --- a/kernel/kernel/locking/osq_lock.c +++ b/kernel/kernel/locking/osq_lock.c @@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its @@ -92,6 +92,12 @@ bool osq_lock(struct optimistic_spin_queue *lock) node->next = NULL; node->cpu = curr; + /* + * We need both ACQUIRE (pairs with corresponding RELEASE in + * unlock() uncontended, or fastpath) and RELEASE (to publish + * the node fields we just initialised) semantics when updating + * the lock tail. + */ old = atomic_xchg(&lock->tail, curr); if (old == OSQ_UNLOCKED_VAL) return true; @@ -184,7 +190,8 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Fast path for the uncontended case. */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + if (likely(atomic_cmpxchg_release(&lock->tail, curr, + OSQ_UNLOCKED_VAL) == curr)) return; /* diff --git a/kernel/kernel/locking/percpu-rwsem.c b/kernel/kernel/locking/percpu-rwsem.c index 652a8ee8e..f231e0bb3 100644 --- a/kernel/kernel/locking/percpu-rwsem.c +++ b/kernel/kernel/locking/percpu-rwsem.c @@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); + rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); return 0; } +EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { + /* + * XXX: temporary kludge. The error path in alloc_super() + * assumes that percpu_free_rwsem() is safe after kzalloc(). + */ + if (!brw->fast_read_ctr) + return; + + rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } /* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer + * This is the fast-path for down_read/up_read. If it succeeds we rely + * on the barriers provided by rcu_sync_enter/exit; see the comments in + * percpu_down_write() and percpu_up_write(). * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { - bool success = false; + bool success; preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { + success = rcu_sync_is_idle(&brw->rss); + if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } preempt_enable(); return success; @@ -77,16 +70,30 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + + if (likely(update_fast_ctr(brw, +1))) return; - } - down_read(&brw->rw_sem); + /* Avoid rwsem_acquire_read() and rwsem_release() */ + __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } +EXPORT_SYMBOL_GPL(percpu_down_read); + +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} void percpu_up_read(struct percpu_rw_semaphore *brw) { @@ -99,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); } +EXPORT_SYMBOL_GPL(percpu_up_read); static int clear_fast_ctr(struct percpu_rw_semaphore *brw) { @@ -113,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) return sum; } -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ void percpu_down_write(struct percpu_rw_semaphore *brw) { - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). + * Make rcu_sync_is_idle() == F and thus disable the fast-path in + * percpu_down_read() and percpu_up_read(), and wait for gp pass. * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). + * The latter synchronises us with the preceding readers which used + * the fast-past, so we can not miss the result of __this_cpu_add() + * or anything else inside their criticial sections. */ - synchronize_sched_expedited(); + rcu_sync_enter(&brw->rss); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); @@ -150,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); } +EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) { /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). + * Enable the fast-path in percpu_down_read() and percpu_up_read() + * but only after another gp pass; this adds the necessary barrier + * to ensure the reader can't miss the changes done by us. */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); + rcu_sync_exit(&brw->rss); } +EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/kernel/locking/qrwlock.c b/kernel/kernel/locking/qrwlock.c index f956ede7f..fec082338 100644 --- a/kernel/kernel/locking/qrwlock.c +++ b/kernel/kernel/locking/qrwlock.c @@ -1,5 +1,5 @@ /* - * Queue read/write lock + * Queued read/write locks * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,6 +22,26 @@ #include <linux/hardirq.h> #include <asm/qrwlock.h> +/* + * This internal data structure is used for optimizing access to some of + * the subfields within the atomic_t cnts. + */ +struct __qrwlock { + union { + atomic_t cnts; + struct { +#ifdef __LITTLE_ENDIAN + u8 wmode; /* Writer mode */ + u8 rcnts[3]; /* Reader counts */ +#else + u8 rcnts[3]; /* Reader counts */ + u8 wmode; /* Writer mode */ +#endif + }; + }; + arch_spinlock_t lock; +}; + /** * rspin_until_writer_unlock - inc reader count & spin until writer is gone * @lock : Pointer to queue rwlock structure @@ -35,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { cpu_relax_lowlatency(); - cnts = smp_load_acquire((u32 *)&lock->cnts); + cnts = atomic_read_acquire(&lock->cnts); } } /** - * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure + * @cnts: Current qrwlock lock value */ -void queue_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) { - u32 cnts; - /* * Readers come here when they cannot get the lock without waiting */ if (unlikely(in_interrupt())) { /* - * Readers in interrupt context will spin until the lock is - * available without waiting in the queue. + * Readers in interrupt context will get the lock immediately + * if the writer is just waiting (not holding the lock yet). + * The rspin_until_writer_unlock() function returns immediately + * in this case. Otherwise, they will spin (with ACQUIRE + * semantics) until the lock is available without waiting in + * the queue. */ - cnts = smp_load_acquire((u32 *)&lock->cnts); rspin_until_writer_unlock(lock, cnts); return; } @@ -64,42 +86,37 @@ void queue_read_lock_slowpath(struct qrwlock *lock) /* * Put the reader into the wait queue */ - arch_spin_lock(&lock->lock); + arch_spin_lock(&lock->wait_lock); /* - * At the head of the wait queue now, wait until the writer state - * goes to 0 and then try to increment the reader count and get - * the lock. It is possible that an incoming writer may steal the - * lock in the interim, so it is necessary to check the writer byte - * to make sure that the write lock isn't taken. + * The ACQUIRE semantics of the following spinning code ensure + * that accesses can't leak upwards out of our subsequent critical + * section in the case that the lock is currently held for write. */ - while (atomic_read(&lock->cnts) & _QW_WMASK) - cpu_relax_lowlatency(); - - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); /* * Signal the next one in queue to become queue head */ - arch_spin_unlock(&lock->lock); + arch_spin_unlock(&lock->wait_lock); } -EXPORT_SYMBOL(queue_read_lock_slowpath); +EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * queued_write_lock_slowpath - acquire write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ -void queue_write_lock_slowpath(struct qrwlock *lock) +void queued_write_lock_slowpath(struct qrwlock *lock) { u32 cnts; /* Put the writer into the wait queue */ - arch_spin_lock(&lock->lock); + arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; /* @@ -107,10 +124,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock) * or wait for a previous writer to go away. */ for (;;) { - cnts = atomic_read(&lock->cnts); - if (!(cnts & _QW_WMASK) && - (atomic_cmpxchg(&lock->cnts, cnts, - cnts | _QW_WAITING) == cnts)) + struct __qrwlock *l = (struct __qrwlock *)lock; + + if (!READ_ONCE(l->wmode) && + (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); @@ -120,13 +137,13 @@ void queue_write_lock_slowpath(struct qrwlock *lock) for (;;) { cnts = atomic_read(&lock->cnts); if ((cnts == _QW_WAITING) && - (atomic_cmpxchg(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) + (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) break; cpu_relax_lowlatency(); } unlock: - arch_spin_unlock(&lock->lock); + arch_spin_unlock(&lock->wait_lock); } -EXPORT_SYMBOL(queue_write_lock_slowpath); +EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/kernel/locking/qspinlock.c b/kernel/kernel/locking/qspinlock.c new file mode 100644 index 000000000..87e9ce6a6 --- /dev/null +++ b/kernel/kernel/locking/qspinlock.c @@ -0,0 +1,473 @@ +/* + * Queued spinlock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. + * + * Authors: Waiman Long <waiman.long@hp.com> + * Peter Zijlstra <peterz@infradead.org> + */ + +#ifndef _GEN_PV_LOCK_SLOWPATH + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <asm/byteorder.h> +#include <asm/qspinlock.h> + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. The paper below provides a good description for this kind + * of lock. + * + * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * + * This queued spinlock implementation is based on the MCS lock, however to make + * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing + * API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now + * we can encode the tail by combining the 2-bit nesting level with the cpu + * number. With one byte for the lock value and 3 bytes for the tail, only a + * 32-bit word is now needed. Even though we only need 1 bit for the lock, + * we extend it to a full byte to achieve better performance for architectures + * that support atomic byte write. + * + * We also change the first spinner to spin on the lock bit instead of its + * node; whereby avoiding the need to carry a node from lock to unlock, and + * preserving existing lock API. This also makes the unlock code simpler and + * faster. + * + * N.B. The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * + */ + +#include "mcs_spinlock.h" + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define MAX_NODES 8 +#else +#define MAX_NODES 4 +#endif + +/* + * Per-CPU queue node structures; we can never have more than 4 nested + * contexts: task, softirq, hardirq, nmi. + * + * Exactly fits one 64-byte cacheline on a 64-bit architecture. + * + * PV doubles the storage and uses the second cacheline for PV state. + */ +static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + +#ifdef CONFIG_DEBUG_SPINLOCK + BUG_ON(idx > 3); +#endif + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct mcs_spinlock *decode_tail(u32 tail) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&mcs_nodes[idx], cpu); +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/* + * By using the whole 2nd least significant byte for the pending bit, we + * can allow better optimization of the lock acquisition for the pending + * bit holder. + * + * This internal structure is also used by the set_locked function which + * is not restricted to _Q_PENDING_BITS == 8. + */ +struct __qspinlock { + union { + atomic_t val; +#ifdef __LITTLE_ENDIAN + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; +#else + struct { + u16 tail; + u16 locked_pending; + }; + struct { + u8 reserved[2]; + u8 pending; + u8 locked; + }; +#endif + }; +}; + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + struct __qspinlock *l = (void *)lock; + + return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new, val = atomic_read(&lock->val); + + for (;;) { + new = (val & _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); +} + + +/* + * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for + * all the PV callbacks. + */ + +static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_kick_node(struct qspinlock *lock, + struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_head(struct qspinlock *lock, + struct mcs_spinlock *node) { } + +#define pv_enabled() false + +#define pv_init_node __pv_init_node +#define pv_wait_node __pv_wait_node +#define pv_kick_node __pv_kick_node +#define pv_wait_head __pv_wait_head + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath +#endif + +#endif /* _GEN_PV_LOCK_SLOWPATH */ + +/** + * queued_spin_lock_slowpath - acquire the queued spinlock + * @lock: Pointer to queued spinlock structure + * @val: Current value of the queued spinlock 32-bit word + * + * (queue tail, pending bit, lock value) + * + * fast : slow : unlock + * : : + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) + * : | ^--------.------. / : + * : v \ \ | : + * pending : (0,1,1) +--> (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended : (n,x,y) +--> (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : + * queue : ^--' : + */ +void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + struct mcs_spinlock *prev, *next, *node; + u32 new, old, tail; + int idx; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + if (pv_enabled()) + goto queue; + + if (virt_spin_lock(lock)) + return; + + /* + * wait for in-progress pending->locked hand-overs + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) + cpu_relax(); + } + + /* + * trylock || pending + * + * 0,0,0 -> 0,0,1 ; trylock + * 0,0,1 -> 0,1,1 ; pending + */ + for (;;) { + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + new = _Q_LOCKED_VAL; + if (val == new) + new |= _Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + + /* + * we won the trylock + */ + if (new == _Q_LOCKED_VAL) + return; + + /* + * we're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all clear_pending_set_locked() + * implementations imply full barriers. + */ + while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) + cpu_relax(); + + /* + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 + */ + clear_pending_set_locked(lock); + return; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node = this_cpu_ptr(&mcs_nodes[0]); + idx = node->count++; + tail = encode_tail(smp_processor_id(), idx); + + node += idx; + node->locked = 0; + node->next = NULL; + pv_init_node(node); + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (queued_spin_trylock(lock)) + goto release; + + /* + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + WRITE_ONCE(prev->next, node); + + pv_wait_node(node); + arch_mcs_spin_lock_contended(&node->locked); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + * + */ + pv_wait_head(lock, node); + while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) + cpu_relax(); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,0,0 -> *,0,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail), + * clear the tail code and grab the lock. Otherwise, we only need + * to grab the lock. + */ + for (;;) { + if (val != tail) { + set_locked(lock); + break; + } + old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + if (old == val) + goto release; /* No contention */ + + val = old; + } + + /* + * contended path; wait for next, release. + */ + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + + arch_mcs_spin_unlock_contended(&next->locked); + pv_kick_node(lock, next); + +release: + /* + * release the node + */ + this_cpu_dec(mcs_nodes[0].count); +} +EXPORT_SYMBOL(queued_spin_lock_slowpath); + +/* + * Generate the paravirt code for queued_spin_unlock_slowpath(). + */ +#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#define _GEN_PV_LOCK_SLOWPATH + +#undef pv_enabled +#define pv_enabled() true + +#undef pv_init_node +#undef pv_wait_node +#undef pv_kick_node +#undef pv_wait_head + +#undef queued_spin_lock_slowpath +#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath + +#include "qspinlock_paravirt.h" +#include "qspinlock.c" + +#endif diff --git a/kernel/kernel/locking/qspinlock_paravirt.h b/kernel/kernel/locking/qspinlock_paravirt.h new file mode 100644 index 000000000..f0450ff48 --- /dev/null +++ b/kernel/kernel/locking/qspinlock_paravirt.h @@ -0,0 +1,374 @@ +#ifndef _GEN_PV_LOCK_SLOWPATH +#error "do not include this file" +#endif + +#include <linux/hash.h> +#include <linux/bootmem.h> +#include <linux/debug_locks.h> + +/* + * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead + * of spinning them. + * + * This relies on the architecture to provide two paravirt hypercalls: + * + * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val + * pv_kick(cpu) -- wakes a suspended vcpu + * + * Using these we implement __pv_queued_spin_lock_slowpath() and + * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and + * native_queued_spin_unlock(). + */ + +#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) + +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. + */ +enum vcpu_state { + vcpu_running = 0, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ +}; + +struct pv_node { + struct mcs_spinlock mcs; + struct mcs_spinlock __res[3]; + + int cpu; + u8 state; +}; + +/* + * Lock and MCS node addresses hash table for fast lookup + * + * Hashing is done on a per-cacheline basis to minimize the need to access + * more than one cacheline. + * + * Dynamically allocate a hash table big enough to hold at least 4X the + * number of possible cpus in the system. Allocation is done on page + * granularity. So the minimum number of hash buckets should be at least + * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page. + * + * Since we should not be holding locks from NMI context (very rare indeed) the + * max load factor is 0.75, which is around the point where open addressing + * breaks down. + * + */ +struct pv_hash_entry { + struct qspinlock *lock; + struct pv_node *node; +}; + +#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry)) +#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry)) + +static struct pv_hash_entry *pv_lock_hash; +static unsigned int pv_lock_hash_bits __read_mostly; + +/* + * Allocate memory for the PV qspinlock hash buckets + * + * This function should be called from the paravirt spinlock initialization + * routine. + */ +void __init __pv_init_lock_hash(void) +{ + int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE); + + if (pv_hash_size < PV_HE_MIN) + pv_hash_size = PV_HE_MIN; + + /* + * Allocate space from bootmem which should be page-size aligned + * and hence cacheline aligned. + */ + pv_lock_hash = alloc_large_system_hash("PV qspinlock", + sizeof(struct pv_hash_entry), + pv_hash_size, 0, HASH_EARLY, + &pv_lock_hash_bits, NULL, + pv_hash_size, pv_hash_size); +} + +#define for_each_hash_entry(he, offset, hash) \ + for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \ + offset < (1 << pv_lock_hash_bits); \ + offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)]) + +static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) +{ + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); + struct pv_hash_entry *he; + + for_each_hash_entry(he, offset, hash) { + if (!cmpxchg(&he->lock, NULL, lock)) { + WRITE_ONCE(he->node, node); + return &he->lock; + } + } + /* + * Hard assume there is a free entry for us. + * + * This is guaranteed by ensuring every blocked lock only ever consumes + * a single entry, and since we only have 4 nesting levels per CPU + * and allocated 4*nr_possible_cpus(), this must be so. + * + * The single entry is guaranteed by having the lock owner unhash + * before it releases. + */ + BUG(); +} + +static struct pv_node *pv_unhash(struct qspinlock *lock) +{ + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); + struct pv_hash_entry *he; + struct pv_node *node; + + for_each_hash_entry(he, offset, hash) { + if (READ_ONCE(he->lock) == lock) { + node = READ_ONCE(he->node); + WRITE_ONCE(he->lock, NULL); + return node; + } + } + /* + * Hard assume we'll find an entry. + * + * This guarantees a limited lookup time and is itself guaranteed by + * having the lock owner do the unhash -- IFF the unlock sees the + * SLOW flag, there MUST be a hash entry. + */ + BUG(); +} + +/* + * Initialize the PV part of the mcs_spinlock node. + */ +static void pv_init_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + + BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock)); + + pn->cpu = smp_processor_id(); + pn->state = vcpu_running; +} + +/* + * Wait for node->locked to become true, halt the vcpu after a short spin. + * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. + */ +static void pv_wait_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + int loop; + + for (;;) { + for (loop = SPIN_THRESHOLD; loop; loop--) { + if (READ_ONCE(node->locked)) + return; + cpu_relax(); + } + + /* + * Order pn->state vs pn->locked thusly: + * + * [S] pn->state = vcpu_halted [S] next->locked = 1 + * MB MB + * [L] pn->locked [RmW] pn->state = vcpu_hashed + * + * Matches the cmpxchg() from pv_kick_node(). + */ + smp_store_mb(pn->state, vcpu_halted); + + if (!READ_ONCE(node->locked)) + pv_wait(&pn->state, vcpu_halted); + + /* + * If pv_kick_node() changed us to vcpu_hashed, retain that value + * so that pv_wait_head() knows to not also try to hash this lock. + */ + cmpxchg(&pn->state, vcpu_halted, vcpu_running); + + /* + * If the locked flag is still not set after wakeup, it is a + * spurious wakeup and the vCPU should wait again. However, + * there is a pretty high overhead for CPU halting and kicking. + * So it is better to spin for a while in the hope that the + * MCS lock will be released soon. + */ + } + + /* + * By now our node->locked should be 1 and our caller will not actually + * spin-wait for it. We do however rely on our caller to do a + * load-acquire for us. + */ +} + +/* + * Called after setting next->locked = 1 when we're the lock owner. + * + * Instead of waking the waiters stuck in pv_wait_node() advance their state such + * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + */ +static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; + + /* + * If the vCPU is indeed halted, advance its state to match that of + * pv_wait_node(). If OTOH this fails, the vCPU was running and will + * observe its next->locked value and advance itself. + * + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + */ + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table and set the _Q_SLOW_VAL. + * + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. + */ + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); +} + +/* + * Wait for l->locked to become clear; halt the vcpu after a short spin. + * __pv_queued_spin_unlock() will wake us. + */ +static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; + struct qspinlock **lp = NULL; + int loop; + + /* + * If pv_kick_node() already advanced our state, we don't need to + * insert ourselves into the hash table anymore. + */ + if (READ_ONCE(pn->state) == vcpu_hashed) + lp = (struct qspinlock **)1; + + for (;;) { + for (loop = SPIN_THRESHOLD; loop; loop--) { + if (!READ_ONCE(l->locked)) + return; + cpu_relax(); + } + + if (!lp) { /* ONCE */ + lp = pv_hash(lock, pn); + + /* + * We must hash before setting _Q_SLOW_VAL, such that + * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() + * we'll be sure to be able to observe our hash entry. + * + * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL + * MB RMB + * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> + * + * Matches the smp_rmb() in __pv_queued_spin_unlock(). + */ + if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + /* + * The lock is free and _Q_SLOW_VAL has never + * been set. Therefore we need to unhash before + * getting the lock. + */ + WRITE_ONCE(*lp, NULL); + return; + } + } + pv_wait(&l->locked, _Q_SLOW_VAL); + + /* + * The unlocker should have freed the lock before kicking the + * CPU. So if the lock is still not free, it is a spurious + * wakeup and so the vCPU should wait again after spinning for + * a while. + */ + } + + /* + * Lock is unlocked now; the caller will acquire it without waiting. + * As with pv_wait_node() we rely on the caller to do a load-acquire + * for us. + */ +} + +/* + * PV version of the unlock function to be used in stead of + * queued_spin_unlock(). + */ +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + struct pv_node *node; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + if (unlikely(locked != _Q_SLOW_VAL)) { + WARN(!debug_locks_silent, + "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", + (unsigned long)lock, atomic_read(&lock->val)); + return; + } + + /* + * A failed cmpxchg doesn't provide any memory-ordering guarantees, + * so we need a barrier to order the read of the node data in + * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. + * + * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + */ + smp_rmb(); + + /* + * Since the above failed to release, this must be the SLOW path. + * Therefore start by looking up the blocked node and unhashing it. + */ + node = pv_unhash(lock); + + /* + * Now that we have a reference to the (likely) blocked pv_node, + * release the lock. + */ + smp_store_release(&l->locked, 0); + + /* + * At this point the memory pointed at by lock can be freed/reused, + * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. + */ + pv_kick(node->cpu); +} +/* + * Include the architecture specific callee-save thunk of the + * __pv_queued_spin_unlock(). This thunk is put together with + * __pv_queued_spin_unlock() near the top of the file to make sure + * that the callee-save thunk and the real unlock function are close + * to each other sharing consecutive instruction cachelines. + */ +#include <asm/qspinlock_paravirt.h> + diff --git a/kernel/kernel/locking/rt.c b/kernel/kernel/locking/rt.c index c236efa48..d4ab61c18 100644 --- a/kernel/kernel/locking/rt.c +++ b/kernel/kernel/locking/rt.c @@ -235,7 +235,6 @@ EXPORT_SYMBOL(rt_read_trylock); void __lockfunc rt_write_lock(rwlock_t *rwlock) { rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); - migrate_disable(); __rt_spin_lock(&rwlock->lock); } EXPORT_SYMBOL(rt_write_lock); @@ -249,7 +248,6 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) * recursive read locks succeed when current owns the lock */ if (rt_mutex_owner(lock) != current) { - migrate_disable(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); __rt_spin_lock(lock); } @@ -375,7 +373,7 @@ void rt_down_write_nested_lock(struct rw_semaphore *rwsem, } EXPORT_SYMBOL(rt_down_write_nested_lock); -int rt_down_read_trylock(struct rw_semaphore *rwsem) +int rt__down_read_trylock(struct rw_semaphore *rwsem) { struct rt_mutex *lock = &rwsem->lock; int ret = 1; @@ -390,24 +388,39 @@ int rt_down_read_trylock(struct rw_semaphore *rwsem) else if (!rwsem->read_depth) ret = 0; - if (ret) { + if (ret) rwsem->read_depth++; + return ret; + +} + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + int ret; + + ret = rt__down_read_trylock(rwsem); + if (ret) rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - } + return ret; } EXPORT_SYMBOL(rt_down_read_trylock); -static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +void rt__down_read(struct rw_semaphore *rwsem) { struct rt_mutex *lock = &rwsem->lock; - rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); - if (rt_mutex_owner(lock) != current) rt_mutex_lock(&rwsem->lock); rwsem->read_depth++; } +EXPORT_SYMBOL(rt__down_read); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt__down_read(rwsem); +} void rt_down_read(struct rw_semaphore *rwsem) { diff --git a/kernel/kernel/locking/rtmutex-tester.c b/kernel/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d9..000000000 --- a/kernel/kernel/locking/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - */ -#include <linux/device.h> -#include <linux/kthread.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/freezer.h> -#include <linux/stat.h> - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/kernel/locking/rtmutex.c b/kernel/kernel/locking/rtmutex.c index 2822aceb8..66971005c 100644 --- a/kernel/kernel/locking/rtmutex.c +++ b/kernel/kernel/locking/rtmutex.c @@ -82,18 +82,27 @@ static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) } /* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up + * We can speed up the acquire/release, if there's no debugging state to be + * set up. + */ +#ifndef CONFIG_DEBUG_RT_MUTEXES +# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) + +/* + * Callers must hold the ->wait_lock -- which is the whole purpose as we force + * all future threads that attempt to [Rmw] the lock to the slowpath. As such + * relaxed semantics suffice. */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; do { owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); + } while (cmpxchg_relaxed(p, owner, + owner | RT_MUTEX_HAS_WAITERS) != owner); } /* @@ -102,13 +111,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -133,11 +143,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) * lock(wait_lock); * acquire(lock); */ - return rt_mutex_cmpxchg(lock, owner, NULL); + return rt_mutex_cmpxchg_release(lock, owner, NULL); } #else -# define rt_mutex_cmpxchg(l,c,n) (0) +# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) +# define rt_mutex_cmpxchg_acquire(l,c,n) (0) +# define rt_mutex_cmpxchg_release(l,c,n) (0) + static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) @@ -147,11 +160,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -170,7 +184,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return (left->task->dl.deadline < right->task->dl.deadline); + return dl_time_before(left->task->dl.deadline, + right->task->dl.deadline); return 0; } @@ -441,7 +456,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -484,7 +498,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -568,7 +582,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -599,7 +613,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -607,14 +621,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -629,8 +643,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -645,13 +659,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ prerequeue_top_waiter = rt_mutex_top_waiter(lock); - /* [7] Requeue the waiter in the lock waiter list. */ + /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); waiter->prio = task->prio; rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -672,21 +686,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, lock_top_waiter = rt_mutex_top_waiter(lock); if (prerequeue_top_waiter != lock_top_waiter) rt_mutex_wake_waiter(lock_top_waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { /* * The waiter became the new top (highest priority) * waiter on the lock. Replace the previous top waiter - * in the owner tasks pi waiters list with this waiter + * in the owner tasks pi waiters tree with this waiter * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); @@ -697,7 +711,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * The waiter was the top waiter on the lock, but is * no longer the top prority waiter. Replace waiter in - * the owner tasks pi waiters list with the new top + * the owner tasks pi waiters tree with the new top * (highest priority) waiter and adjust the priority * of the owner. * The new top waiter is stored in @waiter so that @@ -733,8 +747,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -757,7 +771,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -786,19 +800,17 @@ static inline int lock_is_stealable(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock - * @waiter: The waiter that is queued to the lock's wait list if the + * @waiter: The waiter that is queued to the lock's wait tree if the * callsite called task_blocked_on_lock(), otherwise NULL */ static int __try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter, int mode) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -826,7 +838,7 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock, /* * If @waiter != NULL, @task has already enqueued the waiter - * into @lock waiter list. If @waiter == NULL then this is a + * into @lock waiter tree. If @waiter == NULL then this is a * trylock attempt. */ if (waiter) { @@ -841,7 +853,7 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock, /* * We can acquire the lock. Remove the waiter from the - * lock waiters list. + * lock waiters tree. */ rt_mutex_dequeue(lock, waiter); @@ -869,7 +881,7 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock, * No waiters. Take the lock without the * pi_lock dance.@task->pi_blocked_on is NULL * and we have no waiters to enqueue in @task - * pi waiters list. + * pi waiters tree. */ goto takeit; } @@ -881,16 +893,16 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If * other waiters exist we have to insert the highest priority - * waiter into @task->pi_waiters list. + * waiter into @task->pi_waiters tree. */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -912,20 +924,25 @@ takeit: * preemptible spin_lock functions: */ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex *lock)) + void (*slowfn)(struct rt_mutex *lock, + bool mg_off), + bool do_mig_dis) { might_sleep_no_state_check(); - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + if (do_mig_dis) + migrate_disable(); + + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) rt_mutex_deadlock_account_lock(lock, current); else - slowfn(lock); + slowfn(lock, do_mig_dis); } static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, void (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) rt_mutex_deadlock_account_unlock(current); else slowfn(lock); @@ -966,9 +983,6 @@ static int adaptive_wait(struct rt_mutex *lock, } #endif -# define pi_lock(lock) raw_spin_lock_irq(lock) -# define pi_unlock(lock) raw_spin_unlock_irq(lock) - static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, @@ -980,18 +994,20 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * We store the current state under p->pi_lock in p->saved_state and * the try_to_wake_up() code handles this accordingly. */ -static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock, + bool mg_off) { struct task_struct *lock_owner, *self = current; struct rt_mutex_waiter waiter, *top_waiter; + unsigned long flags; int ret; rt_mutex_init_waiter(&waiter, true); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return; } @@ -1003,12 +1019,12 @@ static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) * as well. We are serialized via pi_lock against wakeups. See * try_to_wake_up(). */ - pi_lock(&self->pi_lock); + raw_spin_lock(&self->pi_lock); self->saved_state = self->state; __set_current_state_no_track(TASK_UNINTERRUPTIBLE); - pi_unlock(&self->pi_lock); + raw_spin_unlock(&self->pi_lock); - ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0); + ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK); BUG_ON(ret); for (;;) { @@ -1019,18 +1035,23 @@ static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) top_waiter = rt_mutex_top_waiter(lock); lock_owner = rt_mutex_owner(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); debug_rt_mutex_print_deadlock(&waiter); - if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) - schedule_rt_mutex(lock); + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) { + if (mg_off) + migrate_enable(); + schedule(); + if (mg_off) + migrate_disable(); + } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); - pi_lock(&self->pi_lock); + raw_spin_lock(&self->pi_lock); __set_current_state_no_track(TASK_UNINTERRUPTIBLE); - pi_unlock(&self->pi_lock); + raw_spin_unlock(&self->pi_lock); } /* @@ -1040,10 +1061,10 @@ static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) * happened while we were blocked. Clear saved_state so * try_to_wakeup() does not get confused. */ - pi_lock(&self->pi_lock); + raw_spin_lock(&self->pi_lock); __set_current_state_no_track(self->saved_state); self->saved_state = TASK_RUNNING; - pi_unlock(&self->pi_lock); + raw_spin_unlock(&self->pi_lock); /* * try_to_take_rt_mutex() sets the waiter bit @@ -1054,18 +1075,24 @@ static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock)); BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry)); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); debug_rt_mutex_free_waiter(&waiter); } -static void wakeup_next_waiter(struct rt_mutex *lock); +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q, + struct rt_mutex *lock); /* * Slow path to release a rt_mutex spin_lock style */ static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + WAKE_Q(wake_q); + WAKE_Q(wake_sleeper_q); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1073,45 +1100,69 @@ static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return; } - wakeup_next_waiter(lock); + mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); + wake_up_q_sleeper(&wake_sleeper_q); /* Undo pi boosting.when necessary */ rt_mutex_adjust_prio(current); } +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock__no_mg); + void __lockfunc rt_spin_lock(spinlock_t *lock) { - rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(rt_spin_lock); void __lockfunc __rt_spin_lock(struct rt_mutex *lock) { - rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true); } EXPORT_SYMBOL(__rt_spin_lock); +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false); +} +EXPORT_SYMBOL(__rt_spin_lock__no_mg); + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) { - rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true); } EXPORT_SYMBOL(rt_spin_lock_nested); #endif +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock__no_mg); + void __lockfunc rt_spin_unlock(spinlock_t *lock) { /* NOTE: we always pass in '1' for nested, for simplicity */ spin_release(&lock->dep_map, 1, _RET_IP_); rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); + migrate_enable(); } EXPORT_SYMBOL(rt_spin_unlock); @@ -1138,12 +1189,27 @@ int __lockfunc __rt_spin_trylock(struct rt_mutex *lock) return rt_mutex_trylock(lock); } +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock) +{ + int ret; + + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock__no_mg); + int __lockfunc rt_spin_trylock(spinlock_t *lock) { - int ret = rt_mutex_trylock(&lock->lock); + int ret; + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); if (ret) spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); return ret; } EXPORT_SYMBOL(rt_spin_trylock); @@ -1182,12 +1248,10 @@ int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; - migrate_disable(); rt_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; rt_spin_unlock(lock); - migrate_enable(); return 0; } EXPORT_SYMBOL(atomic_dec_and_spin_lock); @@ -1253,7 +1317,7 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -1264,7 +1328,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -1278,7 +1341,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * In the case of futex requeue PI, this will be a proxy @@ -1290,7 +1353,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * the task if PI_WAKEUP_INPROGRESS is set. */ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); return -EAGAIN; } @@ -1308,12 +1371,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -1328,7 +1391,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -1344,31 +1407,29 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } /* - * Wake up the next waiter on the lock. + * Remove the top waiter from the current tasks pi waiter tree and + * queue it up. * - * Remove the top waiter from the current tasks pi waiter list, - * wake it up and return whether the current task needs to undo - * a potential priority boosting. - * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q, + struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1390,20 +1451,18 @@ static void wakeup_next_waiter(struct rt_mutex *lock) */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); - /* - * It's safe to dereference waiter as it cannot go away as - * long as we hold lock->wait_lock. The waiter task needs to - * acquire it in order to dequeue the waiter. - */ - rt_mutex_wake_waiter(waiter); + if (waiter->savestate) + wake_q_add(wake_sleeper_q, waiter->task); + else + wake_q_add(wake_q, waiter->task); } /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, @@ -1412,12 +1471,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock = NULL; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1426,7 +1484,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1439,7 +1497,7 @@ static void remove_waiter(struct rt_mutex *lock, if (rt_mutex_real_waiter(owner->pi_blocked_on)) next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1451,12 +1509,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1492,11 +1550,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1531,13 +1589,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - schedule_rt_mutex(lock); + schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1651,28 +1709,34 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct ww_acquire_ctx *ww_ctx) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; rt_mutex_init_waiter(&waiter, false); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { if (ww_ctx) ww_mutex_account_lock(lock, ww_ctx); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { + if (unlikely(timeout)) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); @@ -1703,7 +1767,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1719,6 +1783,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1730,10 +1795,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1743,18 +1808,23 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } /* - * Slow path to release a rt-mutex: + * Slow path to release a rt-mutex. + * Return whether the current task needs to undo a potential priority boosting. */ -static bool __sched -rt_mutex_slowunlock(struct rt_mutex *lock) +static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, + struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1793,20 +1863,23 @@ rt_mutex_slowunlock(struct rt_mutex *lock) */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* * The wakeup next waiter path does not suffer from the above * race. See the comments there. + * + * Queue the next waiter for wakeup once we release the wait_lock. */ - wakeup_next_waiter(lock); + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + /* check PI boosting */ return true; } @@ -1824,7 +1897,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk, struct ww_acquire_ctx *ww_ctx)) { - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else @@ -1843,7 +1916,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct ww_acquire_ctx *ww_ctx)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg(lock, NULL, current))) { + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else @@ -1854,7 +1927,7 @@ static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 1; } @@ -1863,13 +1936,25 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, static inline void rt_mutex_fastunlock(struct rt_mutex *lock, - bool (*slowfn)(struct rt_mutex *lock)) + bool (*slowfn)(struct rt_mutex *lock, + struct wake_q_head *wqh, + struct wake_q_head *wq_sleeper)) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + WAKE_Q(wake_q); + WAKE_Q(wake_sleeper_q); + + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); - } else if (slowfn(lock)) { + + } else { + bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q); + + wake_up_q(&wake_q); + wake_up_q_sleeper(&wake_sleeper_q); + /* Undo pi boosting if necessary: */ - rt_mutex_adjust_prio(current); + if (deboost) + rt_mutex_adjust_prio(current); } } @@ -1965,10 +2050,21 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); * * @lock: the rt_mutex to be locked * + * This function can only be called in thread context. It's safe to + * call it from atomic regions, but not from hard interrupt or soft + * interrupt context. + * * Returns 1 on success and 0 on contention */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { +#ifdef CONFIG_PREEMPT_RT_FULL + if (WARN_ON(in_irq() || in_nmi())) +#else + if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) +#endif + return 0; + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); } EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -1991,13 +2087,15 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); * Returns: true/false indicating whether priority adjustment is * required or not. */ -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh, + struct wake_q_head *wq_sleeper) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); return false; } - return rt_mutex_slowunlock(lock); + return rt_mutex_slowunlock(lock, wqh, wq_sleeper); } /** @@ -2091,10 +2189,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -2117,14 +2215,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, * PI_REQUEUE_INPROGRESS, so that if the task is waking up * it will know that we are in the process of requeuing it. */ - raw_spin_lock_irq(&task->pi_lock); + raw_spin_lock(&task->pi_lock); if (task->pi_blocked_on) { - raw_spin_unlock_irq(&task->pi_lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); return -EAGAIN; } task->pi_blocked_on = PI_REQUEUE_INPROGRESS; - raw_spin_unlock_irq(&task->pi_lock); + raw_spin_unlock(&task->pi_lock); #endif /* We enforce deadlock detection for futexes */ @@ -2141,10 +2239,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - if (unlikely(ret)) + if (ret && rt_mutex_has_waiters(lock)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -2192,7 +2290,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -2208,7 +2306,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } diff --git a/kernel/kernel/locking/rtmutex_common.h b/kernel/kernel/locking/rtmutex_common.h index 4d317e9a5..289f062f2 100644 --- a/kernel/kernel/locking/rtmutex_common.h +++ b/kernel/kernel/locking/rtmutex_common.h @@ -15,28 +15,6 @@ #include <linux/rtmutex.h> /* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * @@ -135,9 +113,9 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); - -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock); - +extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh, + struct wake_q_head *wq_sleeper); extern void rt_mutex_adjust_prio(struct task_struct *task); #ifdef CONFIG_DEBUG_RT_MUTEXES diff --git a/kernel/kernel/locking/rwsem-xadd.c b/kernel/kernel/locking/rwsem-xadd.c index 3417d0172..a4d4de05b 100644 --- a/kernel/kernel/locking/rwsem-xadd.c +++ b/kernel/kernel/locking/rwsem-xadd.c @@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) * to reduce unnecessary expensive cmpxchg() operations. */ if (count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { if (!list_is_singular(&sem->wait_list)) rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); @@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + old = cmpxchg_acquire(&sem->count, count, + count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); return true; @@ -409,11 +410,24 @@ done: return taken; } +/* + * Return true if the rwsem has active spinner + */ +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return osq_is_locked(&sem->osq); +} + #else static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } + +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return false; +} #endif /* @@ -496,7 +510,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + /* + * If a spinner is present, it is not necessary to do the wakeup. + * Try to do wakeup only if the trylock succeeds to minimize + * spinlock contention which may introduce too much delay in the + * unlock operation. + * + * spinning writer up_write/up_read caller + * --------------- ----------------------- + * [S] osq_unlock() [L] osq + * MB RMB + * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) + * + * Here, it is important to make sure that there won't be a missed + * wakeup while the rwsem is free and the only spinning writer goes + * to sleep without taking the rwsem. Even when the spinning writer + * is just going to break out of the waiting loop, it will still do + * a trylock in rwsem_down_write_failed() before sleeping. IOW, if + * rwsem_has_spinner() is true, it will guarantee at least one + * trylock attempt on the rwsem later on. + */ + if (rwsem_has_spinner(sem)) { + /* + * The smp_rmb() here is to make sure that the spinner + * state is consulted before reading the wait_lock. + */ + smp_rmb(); + if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) + return sem; + goto locked; + } raw_spin_lock_irqsave(&sem->wait_lock, flags); +locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) diff --git a/kernel/kernel/membarrier.c b/kernel/kernel/membarrier.c new file mode 100644 index 000000000..536c727a5 --- /dev/null +++ b/kernel/kernel/membarrier.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, or if the command argument is invalid, + * this system call returns -EINVAL. For a given command, with flags argument + * set to 0, this system call is guaranteed to always return the same value + * until reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + return MEMBARRIER_CMD_BITMASK; + case MEMBARRIER_CMD_SHARED: + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/kernel/memremap.c b/kernel/kernel/memremap.c new file mode 100644 index 000000000..25ced161e --- /dev/null +++ b/kernel/kernel/memremap.c @@ -0,0 +1,202 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/memory_hotplug.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size) +{ + struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + + /* In the simple case just return the existing linear address */ + if (!PageHighMem(page)) + return __va(offset); + return NULL; /* fallback to ioremap_cache */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: either MEMREMAP_WB or MEMREMAP_WT + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. + * + * MEMREMAP_WB - matches the default mapping for "System RAM" on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map "System RAM" with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, "System RAM"); + void *addr = NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + flags &= ~MEMREMAP_WB; + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in "System RAM" + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size); + if (!addr) + addr = ioremap_cache(offset, size); + } + + /* + * If we don't have a mapping yet and more request flags are + * pending then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * "System RAM" + */ + if (!addr && is_ram == REGION_INTERSECTS && flags) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) { + flags &= ~MEMREMAP_WT; + addr = ioremap_wt(offset, size); + } + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(*(void **)res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); + if (!ptr) + return ERR_PTR(-ENOMEM); + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + return ERR_PTR(-ENXIO); + } + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); +} +EXPORT_SYMBOL(devm_memunmap); + +#ifdef CONFIG_ZONE_DEVICE +struct page_map { + struct resource res; +}; + +static void devm_memremap_pages_release(struct device *dev, void *res) +{ + struct page_map *page_map = res; + + /* pages are dead and unused, undo the arch mapping */ + arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); +} + +void *devm_memremap_pages(struct device *dev, struct resource *res) +{ + int is_ram = region_intersects(res->start, resource_size(res), + "System RAM"); + struct page_map *page_map; + int error, nid; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, res); + return ERR_PTR(-ENXIO); + } + + if (is_ram == REGION_INTERSECTS) + return __va(res->start); + + page_map = devres_alloc_node(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); + if (!page_map) + return ERR_PTR(-ENOMEM); + + memcpy(&page_map->res, res, sizeof(*res)); + + nid = dev_to_node(dev); + if (nid < 0) + nid = numa_mem_id(); + + error = arch_add_memory(nid, res->start, resource_size(res), true); + if (error) { + devres_free(page_map); + return ERR_PTR(error); + } + + devres_add(dev, page_map); + return __va(res->start); +} +EXPORT_SYMBOL(devm_memremap_pages); +#endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/kernel/module.c b/kernel/kernel/module.c index cfc9e843a..0e5c71195 100644 --- a/kernel/kernel/module.c +++ b/kernel/kernel/module.c @@ -18,7 +18,7 @@ */ #include <linux/export.h> #include <linux/moduleloader.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/file.h> @@ -101,48 +101,201 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ -#ifdef CONFIG_MODULE_SIG -#ifdef CONFIG_MODULE_SIG_FORCE -static bool sig_enforce = true; -#else -static bool sig_enforce = false; +#ifdef CONFIG_MODULES_TREE_LOOKUP + +/* + * Use a latched RB-tree for __module_address(); this allows us to use + * RCU-sched lookups of the address from any context. + * + * Because modules have two address ranges: init and core, we need two + * latch_tree_nodes entries. Therefore we need the back-pointer from + * mod_tree_node. + * + * Because init ranges are short lived we mark them unlikely and have placed + * them outside the critical cacheline in struct module. + * + * This is conditional on PERF_EVENTS || TRACING because those can really hit + * __module_address() hard by doing a lot of stack unwinding; potentially from + * NMI context. + */ -static int param_set_bool_enable_only(const char *val, - const struct kernel_param *kp) +static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - int err; - bool test; - struct kernel_param dummy_kp = *kp; + struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); + struct module *mod = mtn->mod; - dummy_kp.arg = &test; + if (unlikely(mtn == &mod->mtn_init)) + return (unsigned long)mod->module_init; - err = param_set_bool(val, &dummy_kp); - if (err) - return err; + return (unsigned long)mod->module_core; +} + +static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) +{ + struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); + struct module *mod = mtn->mod; + + if (unlikely(mtn == &mod->mtn_init)) + return (unsigned long)mod->init_size; + + return (unsigned long)mod->core_size; +} + +static __always_inline bool +mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) +{ + return __mod_tree_val(a) < __mod_tree_val(b); +} + +static __always_inline int +mod_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long start, end; + + start = __mod_tree_val(n); + if (val < start) + return -1; - /* Don't let them unset it once it's set! */ - if (!test && sig_enforce) - return -EROFS; + end = start + __mod_tree_size(n); + if (val >= end) + return 1; - if (test) - sig_enforce = true; return 0; } -static const struct kernel_param_ops param_ops_bool_enable_only = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = param_set_bool_enable_only, - .get = param_get_bool, +static const struct latch_tree_ops mod_tree_ops = { + .less = mod_tree_less, + .comp = mod_tree_comp, }; -#define param_check_bool_enable_only param_check_bool +static struct mod_tree_root { + struct latch_tree_root root; + unsigned long addr_min; + unsigned long addr_max; +} mod_tree __cacheline_aligned = { + .addr_min = -1UL, +}; + +#define module_addr_min mod_tree.addr_min +#define module_addr_max mod_tree.addr_max + +static noinline void __mod_tree_insert(struct mod_tree_node *node) +{ + latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); +} + +static void __mod_tree_remove(struct mod_tree_node *node) +{ + latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); +} + +/* + * These modifications: insert, remove_init and remove; are serialized by the + * module_mutex. + */ +static void mod_tree_insert(struct module *mod) +{ + mod->mtn_core.mod = mod; + mod->mtn_init.mod = mod; + + __mod_tree_insert(&mod->mtn_core); + if (mod->init_size) + __mod_tree_insert(&mod->mtn_init); +} + +static void mod_tree_remove_init(struct module *mod) +{ + if (mod->init_size) + __mod_tree_remove(&mod->mtn_init); +} + +static void mod_tree_remove(struct module *mod) +{ + __mod_tree_remove(&mod->mtn_core); + mod_tree_remove_init(mod); +} + +static struct module *mod_find(unsigned long addr) +{ + struct latch_tree_node *ltn; + + ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); + if (!ltn) + return NULL; + + return container_of(ltn, struct mod_tree_node, node)->mod; +} + +#else /* MODULES_TREE_LOOKUP */ + +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + +static void mod_tree_insert(struct module *mod) { } +static void mod_tree_remove_init(struct module *mod) { } +static void mod_tree_remove(struct module *mod) { } + +static struct module *mod_find(unsigned long addr) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} + +#endif /* MODULES_TREE_LOOKUP */ + +/* + * Bounds of module text, for speeding up __module_address. + * Protected by module_mutex. + */ +static void __mod_update_bounds(void *base, unsigned int size) +{ + unsigned long min = (unsigned long)base; + unsigned long max = min + size; + + if (min < module_addr_min) + module_addr_min = min; + if (max > module_addr_max) + module_addr_max = max; +} + +static void mod_update_bounds(struct module *mod) +{ + __mod_update_bounds(mod->module_core, mod->core_size); + if (mod->init_size) + __mod_update_bounds(mod->module_init, mod->init_size); +} + +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + +static void module_assert_mutex(void) +{ + lockdep_assert_held(&module_mutex); +} + +static void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + +static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); +#ifndef CONFIG_MODULE_SIG_FORCE module_param(sig_enforce, bool_enable_only, 0644); #endif /* !CONFIG_MODULE_SIG_FORCE */ -#endif /* CONFIG_MODULE_SIG */ /* Block module loading/unloading? */ int modules_disabled = 0; @@ -153,10 +306,6 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address. - * Protected by module_mutex. */ -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - int register_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -178,6 +327,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -318,6 +470,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, #endif }; + module_assert_mutex_or_preempt(); + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) return true; @@ -451,12 +605,17 @@ const struct kernel_symbol *find_symbol(const char *name, } EXPORT_SYMBOL_GPL(find_symbol); -/* Search for module by name: must hold module_mutex. */ +/* + * Search for module by name: must hold module_mutex (or preempt disabled + * for read-only access). + */ static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; + module_assert_mutex_or_preempt(); + list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; @@ -468,6 +627,7 @@ static struct module *find_module_all(const char *name, size_t len, struct module *find_module(const char *name) { + module_assert_mutex(); return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -906,11 +1066,15 @@ void symbol_put_addr(void *addr) if (core_kernel_text(a)) return; - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. + */ + preempt_disable(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); + preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -1169,11 +1333,17 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - /* Since this should be found in kernel (which can't be removed), - * no locking is necessary. */ + /* + * Since this should be found in kernel (which can't be removed), no + * locking is necessary -- use preempt_disable() to placate lockdep. + */ + preempt_disable(); if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, - &crc, true, false)) + &crc, true, false)) { + preempt_enable(); BUG(); + } + preempt_enable(); return check_version(sechdrs, versindex, VMLINUX_SYMBOL_STR(module_layout), mod, crc, NULL); @@ -1661,6 +1831,10 @@ static void mod_sysfs_fini(struct module *mod) mod_kobject_put(mod); } +static void init_param_lock(struct module *mod) +{ + mutex_init(&mod->param_lock); +} #else /* !CONFIG_SYSFS */ static int mod_sysfs_setup(struct module *mod, @@ -1683,6 +1857,9 @@ static void del_usage_links(struct module *mod) { } +static void init_param_lock(struct module *mod) +{ +} #endif /* CONFIG_SYSFS */ static void mod_sysfs_teardown(struct module *mod) @@ -1852,10 +2029,11 @@ static void free_module(struct module *mod) mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mod_tree_remove(mod); /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); - /* Wait for RCU synchronizing before releasing mod->list and buglist. */ - synchronize_rcu(); + /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + synchronize_sched(); mutex_unlock(&module_mutex); /* This may be NULL, but that's OK */ @@ -2317,10 +2495,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_size = ALIGN(mod->init_size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_size; + mod->init_size += sizeof(struct mod_kallsyms); + mod->init_size = debug_align(mod->init_size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2329,28 +2518,33 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs; + mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2384,22 +2578,6 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } -static void *module_alloc_update_bounds(unsigned long size) -{ - void *ret = module_alloc(size); - - if (ret) { - mutex_lock(&module_mutex); - /* Update module bounds. */ - if ((unsigned long)ret < module_addr_min) - module_addr_min = (unsigned long)ret; - if ((unsigned long)ret + size > module_addr_max) - module_addr_max = (unsigned long)ret + size; - mutex_unlock(&module_mutex); - } - return ret; -} - #ifdef CONFIG_DEBUG_KMEMLEAK static void kmemleak_load_module(const struct module *mod, const struct load_info *info) @@ -2805,7 +2983,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc_update_bounds(mod->core_size); + ptr = module_alloc(mod->core_size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2819,7 +2997,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_core = ptr; if (mod->init_size) { - ptr = module_alloc_update_bounds(mod->init_size); + ptr = module_alloc(mod->init_size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -3107,7 +3285,7 @@ static noinline int do_init_module(struct module *mod) * * http://thread.gmane.org/gmane.linux.kernel/1420814 */ - if (current->flags & PF_USED_ASYNC) + if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); mutex_lock(&module_mutex); @@ -3115,10 +3293,10 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + mod_tree_remove_init(mod); unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); mod->module_init = NULL; @@ -3127,11 +3305,11 @@ static noinline int do_init_module(struct module *mod) mod->init_text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. In all the failure paths, - * we call synchronize_rcu/synchronize_sched, but we don't want - * to slow down the success path, so use actual RCU here. + * walking this with preempt disabled. In all the failure paths, we + * call synchronize_sched(), but we don't want to slow down the success + * path, so use actual RCU here. */ - call_rcu(&freeinit->rcu, do_free_init); + call_rcu_sched(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3188,7 +3366,9 @@ again: err = -EEXIST; goto out; } + mod_update_bounds(mod); list_add_rcu(&mod->list, &modules); + mod_tree_insert(mod); err = 0; out: @@ -3237,10 +3417,19 @@ out: return err; } -static int unknown_module_param_cb(char *param, char *val, const char *modname) +static int unknown_module_param_cb(char *param, char *val, const char *modname, + void *arg) { + struct module *mod = arg; + int ret; + + if (strcmp(param, "async_probe") == 0) { + mod->async_probe_requested = true; + return 0; + } + /* Check for magic 'dyndbg' arg */ - int ret = ddebug_dyndbg_module_param_cb(param, val, modname); + ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); return 0; @@ -3295,6 +3484,8 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto unlink_mod; + init_param_lock(mod); + /* Now we've got everything in the final locations, we can * find optional sections. */ err = find_module_sections(mod, info); @@ -3342,7 +3533,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); + -32768, 32767, mod, + unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); goto bug_cleanup; @@ -3391,11 +3583,18 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mod_tree_remove(mod); wake_up_all(&module_wq); - /* Wait for RCU synchronizing before releasing mod->list. */ - synchronize_rcu(); + /* Wait for RCU-sched synchronizing before releasing mod->list. */ + synchronize_sched(); mutex_unlock(&module_mutex); free_module: + /* + * Ftrace needs to clean up what it initialized. + * This does nothing if ftrace_module_init() wasn't called, + * but it must be called outside of module_mutex. + */ + ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -3465,6 +3664,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3472,6 +3676,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3481,32 +3686,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3517,19 +3722,15 @@ const char *module_address_lookup(unsigned long addr, char **modname, char *namebuf) { - struct module *mod; const char *ret = NULL; + struct module *mod; preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - if (modname) - *modname = mod->name; - ret = get_ksymbol(mod, addr, size, offset); - break; - } + mod = __module_address(addr); + if (mod) { + if (modname) + *modname = mod->name; + ret = get_ksymbol(mod, addr, size, offset); } /* Make a copy in here where it's safe */ if (ret) { @@ -3537,6 +3738,7 @@ const char *module_address_lookup(unsigned long addr, ret = namebuf; } preempt_enable(); + return ret; } @@ -3599,19 +3801,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3620,11 +3824,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3660,12 +3865,17 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned int i; int ret; + module_assert_mutex(); + list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } @@ -3834,13 +4044,15 @@ struct module *__module_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry_rcu(mod, &modules, list) { + module_assert_mutex_or_preempt(); + + mod = mod_find(addr); + if (mod) { + BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) - return mod; + mod = NULL; } - return NULL; + return mod; } EXPORT_SYMBOL_GPL(__module_address); diff --git a/kernel/kernel/module_signing.c b/kernel/kernel/module_signing.c index be5b8fac4..6528a79d9 100644 --- a/kernel/kernel/module_signing.c +++ b/kernel/kernel/module_signing.c @@ -10,11 +10,9 @@ */ #include <linux/kernel.h> -#include <linux/err.h> -#include <crypto/public_key.h> -#include <crypto/hash.h> -#include <keys/asymmetric-type.h> +#include <linux/errno.h> #include <keys/system_keyring.h> +#include <crypto/public_key.h> #include "module-internal.h" /* @@ -28,170 +26,22 @@ * - Information block */ struct module_signature { - u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum hash_algo] */ - u8 id_type; /* Key identifier type [enum pkey_id_type] */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ u8 __pad[3]; __be32 sig_len; /* Length of signature data */ }; /* - * Digest the module contents. - */ -static struct public_key_signature *mod_make_digest(enum hash_algo hash, - const void *mod, - unsigned long modlen) -{ - struct public_key_signature *pks; - struct crypto_shash *tfm; - struct shash_desc *desc; - size_t digest_size, desc_size; - int ret; - - pr_devel("==>%s()\n", __func__); - - /* Allocate the hashing algorithm we're going to need and find out how - * big the hash operational data will be. - */ - tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); - if (IS_ERR(tfm)) - return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - digest_size = crypto_shash_digestsize(tfm); - - /* We allocate the hash operational data storage on the end of our - * context data and the digest output buffer on the end of that. - */ - ret = -ENOMEM; - pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); - if (!pks) - goto error_no_pks; - - pks->pkey_hash_algo = hash; - pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; - pks->digest_size = digest_size; - - desc = (void *)pks + sizeof(*pks); - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto error; - - ret = crypto_shash_finup(desc, mod, modlen, pks->digest); - if (ret < 0) - goto error; - - crypto_free_shash(tfm); - pr_devel("<==%s() = ok\n", __func__); - return pks; - -error: - kfree(pks); -error_no_pks: - crypto_free_shash(tfm); - pr_devel("<==%s() = %d\n", __func__, ret); - return ERR_PTR(ret); -} - -/* - * Extract an MPI array from the signature data. This represents the actual - * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the - * size of the MPI in bytes. - * - * RSA signatures only have one MPI, so currently we only read one. - */ -static int mod_extract_mpi_array(struct public_key_signature *pks, - const void *data, size_t len) -{ - size_t nbytes; - MPI mpi; - - if (len < 3) - return -EBADMSG; - nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; - data += 2; - len -= 2; - if (len != nbytes) - return -EBADMSG; - - mpi = mpi_read_raw_data(data, nbytes); - if (!mpi) - return -ENOMEM; - pks->mpi[0] = mpi; - pks->nr_mpi = 1; - return 0; -} - -/* - * Request an asymmetric key. - */ -static struct key *request_asymmetric_key(const char *signer, size_t signer_len, - const u8 *key_id, size_t key_id_len) -{ - key_ref_t key; - size_t i; - char *id, *q; - - pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); - - /* Construct an identifier. */ - id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); - if (!id) - return ERR_PTR(-ENOKEY); - - memcpy(id, signer, signer_len); - - q = id + signer_len; - *q++ = ':'; - *q++ = ' '; - for (i = 0; i < key_id_len; i++) { - *q++ = hex_asc[*key_id >> 4]; - *q++ = hex_asc[*key_id++ & 0x0f]; - } - - *q = 0; - - pr_debug("Look up: \"%s\"\n", id); - - key = keyring_search(make_key_ref(system_trusted_keyring, 1), - &key_type_asymmetric, id); - if (IS_ERR(key)) - pr_warn("Request for unknown module key '%s' err %ld\n", - id, PTR_ERR(key)); - kfree(id); - - if (IS_ERR(key)) { - switch (PTR_ERR(key)) { - /* Hide some search errors */ - case -EACCES: - case -ENOTDIR: - case -EAGAIN: - return ERR_PTR(-ENOKEY); - default: - return ERR_CAST(key); - } - } - - pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); - return key_ref_to_ptr(key); -} - -/* * Verify the signature on a module. */ int mod_verify_sig(const void *mod, unsigned long *_modlen) { - struct public_key_signature *pks; struct module_signature ms; - struct key *key; - const void *sig; size_t modlen = *_modlen, sig_len; - int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -205,46 +55,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - if ((size_t)ms.signer_len + ms.key_id_len >= modlen) - return -EBADMSG; - modlen -= (size_t)ms.signer_len + ms.key_id_len; - *_modlen = modlen; - sig = mod + modlen; - - /* For the moment, only support RSA and X.509 identifiers */ - if (ms.algo != PKEY_ALGO_RSA || - ms.id_type != PKEY_ID_X509) - return -ENOPKG; - if (ms.hash >= PKEY_HASH__LAST || - !hash_algo_name[ms.hash]) + if (ms.id_type != PKEY_ID_PKCS7) { + pr_err("Module is not signed with expected PKCS#7 message\n"); return -ENOPKG; - - key = request_asymmetric_key(sig, ms.signer_len, - sig + ms.signer_len, ms.key_id_len); - if (IS_ERR(key)) - return PTR_ERR(key); - - pks = mod_make_digest(ms.hash, mod, modlen); - if (IS_ERR(pks)) { - ret = PTR_ERR(pks); - goto error_put_key; } - ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, - sig_len); - if (ret < 0) - goto error_free_pks; - - ret = verify_signature(key, pks); - pr_devel("verify_signature() = %d\n", ret); + if (ms.algo != 0 || + ms.hash != 0 || + ms.signer_len != 0 || + ms.key_id_len != 0 || + ms.__pad[0] != 0 || + ms.__pad[1] != 0 || + ms.__pad[2] != 0) { + pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + return -EBADMSG; + } -error_free_pks: - mpi_free(pks->rsa.s); - kfree(pks); -error_put_key: - key_put(key); - pr_devel("<==%s() = %d\n", __func__, ret); - return ret; + return system_verify_data(mod, modlen, mod + modlen, sig_len, + VERIFYING_MODULE_SIGNATURE); } diff --git a/kernel/kernel/notifier.c b/kernel/kernel/notifier.c index ae9fc7cc3..fd2c9acbc 100644 --- a/kernel/kernel/notifier.c +++ b/kernel/kernel/notifier.c @@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str, .signr = sig, }; + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); diff --git a/kernel/kernel/panic.c b/kernel/kernel/panic.c index 04360904f..50d4ae2e7 100644 --- a/kernel/kernel/panic.c +++ b/kernel/kernel/panic.c @@ -23,6 +23,7 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> +#include <linux/console.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -32,7 +33,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -static bool crash_kexec_post_notifiers; +bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; @@ -142,10 +143,22 @@ void panic(const char *fmt, ...) * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. */ - crash_kexec(NULL); + if (crash_kexec_post_notifiers) + crash_kexec(NULL); bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. Locks debug + * should be disabled to avoid reporting bad unlock balance when + * panic() is not being callled from OOPS. + */ + debug_locks_off(); + console_flush_on_panic(); + if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/kernel/params.c b/kernel/kernel/params.c index a22d6a759..a6d6149c0 100644 --- a/kernel/kernel/params.c +++ b/kernel/kernel/params.c @@ -25,15 +25,34 @@ #include <linux/slab.h> #include <linux/ctype.h> -/* Protects all parameters, and incidentally kmalloced_param list. */ +#ifdef CONFIG_SYSFS +/* Protects all built-in parameters, modules use their own param_lock */ static DEFINE_MUTEX(param_lock); +/* Use the module's mutex, or if built-in use the built-in mutex */ +#ifdef CONFIG_MODULES +#define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : ¶m_lock) +#else +#define KPARAM_MUTEX(mod) (¶m_lock) +#endif + +static inline void check_kparam_locked(struct module *mod) +{ + BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod))); +} +#else +static inline void check_kparam_locked(struct module *mod) +{ +} +#endif /* !CONFIG_SYSFS */ + /* This just allows us to keep track of which parameters are kmalloced. */ struct kmalloced_param { struct list_head list; char val[]; }; static LIST_HEAD(kmalloced_params); +static DEFINE_SPINLOCK(kmalloced_params_lock); static void *kmalloc_parameter(unsigned int size) { @@ -43,7 +62,10 @@ static void *kmalloc_parameter(unsigned int size) if (!p) return NULL; + spin_lock(&kmalloced_params_lock); list_add(&p->list, &kmalloced_params); + spin_unlock(&kmalloced_params_lock); + return p->val; } @@ -52,6 +74,7 @@ static void maybe_kfree_parameter(void *param) { struct kmalloced_param *p; + spin_lock(&kmalloced_params_lock); list_for_each_entry(p, &kmalloced_params, list) { if (p->val == param) { list_del(&p->list); @@ -59,6 +82,7 @@ static void maybe_kfree_parameter(void *param) break; } } + spin_unlock(&kmalloced_params_lock); } static char dash2underscore(char c) @@ -100,8 +124,9 @@ static int parse_one(char *param, unsigned num_params, s16 min_level, s16 max_level, + void *arg, int (*handle_unknown)(char *param, char *val, - const char *doing)) + const char *doing, void *arg)) { unsigned int i; int err; @@ -118,17 +143,17 @@ static int parse_one(char *param, return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); - mutex_lock(¶m_lock); + kernel_param_lock(params[i].mod); param_check_unsafe(¶ms[i]); err = params[i].ops->set(val, ¶ms[i]); - mutex_unlock(¶m_lock); + kernel_param_unlock(params[i].mod); return err; } } if (handle_unknown) { pr_debug("doing %s: %s='%s'\n", doing, param, val); - return handle_unknown(param, val, doing); + return handle_unknown(param, val, doing, arg); } pr_debug("Unknown argument '%s'\n", param); @@ -194,9 +219,11 @@ char *parse_args(const char *doing, unsigned num, s16 min_level, s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) + void *arg, + int (*unknown)(char *param, char *val, + const char *doing, void *arg)) { - char *param, *val; + char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); @@ -211,33 +238,34 @@ char *parse_args(const char *doing, args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) - return args; + return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, - min_level, max_level, unknown); + min_level, max_level, arg, unknown); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); switch (ret) { + case 0: + continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ERR_PTR(ret); + break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); - case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); + break; } + + err = ERR_PTR(ret); } - /* All parsed OK. */ - return NULL; + return err; } /* Lazy bastard, eh? */ @@ -251,7 +279,7 @@ char *parse_args(const char *doing, return scnprintf(buffer, PAGE_SIZE, format, \ *((type *)kp->arg)); \ } \ - struct kernel_param_ops param_ops_##name = { \ + const struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ .get = param_get_##name, \ }; \ @@ -298,12 +326,13 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); -struct kernel_param_ops param_ops_charp = { +const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, .get = param_get_charp, .free = param_free_charp, @@ -328,13 +357,44 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_bool); -struct kernel_param_ops param_ops_bool = { +const struct kernel_param_ops param_ops_bool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; EXPORT_SYMBOL(param_ops_bool); +int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) +{ + int err = 0; + bool new_value; + bool orig_value = *(bool *)kp->arg; + struct kernel_param dummy_kp = *kp; + + dummy_kp.arg = &new_value; + + err = param_set_bool(val, &dummy_kp); + if (err) + return err; + + /* Don't let them unset it once it's set! */ + if (!new_value && orig_value) + return -EROFS; + + if (new_value) + err = param_set_bool(val, kp); + + return err; +} +EXPORT_SYMBOL_GPL(param_set_bool_enable_only); + +const struct kernel_param_ops param_ops_bool_enable_only = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = param_set_bool_enable_only, + .get = param_get_bool, +}; +EXPORT_SYMBOL_GPL(param_ops_bool_enable_only); + /* This one must be bool. */ int param_set_invbool(const char *val, const struct kernel_param *kp) { @@ -356,7 +416,7 @@ int param_get_invbool(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_invbool); -struct kernel_param_ops param_ops_invbool = { +const struct kernel_param_ops param_ops_invbool = { .set = param_set_invbool, .get = param_get_invbool, }; @@ -364,12 +424,11 @@ EXPORT_SYMBOL(param_ops_invbool); int param_set_bint(const char *val, const struct kernel_param *kp) { - struct kernel_param boolkp; + /* Match bool exactly, by re-using it. */ + struct kernel_param boolkp = *kp; bool v; int ret; - /* Match bool exactly, by re-using it. */ - boolkp = *kp; boolkp.arg = &v; ret = param_set_bool(val, &boolkp); @@ -379,7 +438,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) } EXPORT_SYMBOL(param_set_bint); -struct kernel_param_ops param_ops_bint = { +const struct kernel_param_ops param_ops_bint = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, @@ -387,7 +446,8 @@ struct kernel_param_ops param_ops_bint = { EXPORT_SYMBOL(param_ops_bint); /* We break the rule and mangle the string. */ -static int param_array(const char *name, +static int param_array(struct module *mod, + const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, @@ -418,7 +478,7 @@ static int param_array(const char *name, /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; - BUG_ON(!mutex_is_locked(¶m_lock)); + check_kparam_locked(mod); ret = set(val, &kp); if (ret != 0) @@ -440,7 +500,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp) const struct kparam_array *arr = kp->arr; unsigned int temp_num; - return param_array(kp->name, val, 1, arr->max, arr->elem, + return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem, arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } @@ -449,14 +509,13 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; - struct kernel_param p; + struct kernel_param p = *kp; - p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { if (i) buffer[off++] = ','; p.arg = arr->elem + arr->elemsize * i; - BUG_ON(!mutex_is_locked(¶m_lock)); + check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; @@ -476,7 +535,7 @@ static void param_array_free(void *arg) arr->ops->free(arr->elem + arr->elemsize * i); } -struct kernel_param_ops param_array_ops = { +const struct kernel_param_ops param_array_ops = { .set = param_array_set, .get = param_array_get, .free = param_array_free, @@ -504,7 +563,7 @@ int param_get_string(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_string); -struct kernel_param_ops param_ops_string = { +const struct kernel_param_ops param_ops_string = { .set = param_set_copystring, .get = param_get_string, }; @@ -539,9 +598,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr, if (!attribute->param->ops->get) return -EPERM; - mutex_lock(¶m_lock); + kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); - mutex_unlock(¶m_lock); + kernel_param_unlock(mk->mod); if (count > 0) { strcat(buf, "\n"); ++count; @@ -551,7 +610,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr, /* sysfs always hands a nul-terminated string in buf. We rely on that. */ static ssize_t param_attr_store(struct module_attribute *mattr, - struct module_kobject *km, + struct module_kobject *mk, const char *buf, size_t len) { int err; @@ -560,10 +619,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, if (!attribute->param->ops->set) return -EPERM; - mutex_lock(¶m_lock); + kernel_param_lock(mk->mod); param_check_unsafe(attribute->param); err = attribute->param->ops->set(buf, attribute->param); - mutex_unlock(¶m_lock); + kernel_param_unlock(mk->mod); if (!err) return len; return err; @@ -577,17 +636,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #endif #ifdef CONFIG_SYSFS -void __kernel_param_lock(void) +void kernel_param_lock(struct module *mod) { - mutex_lock(¶m_lock); + mutex_lock(KPARAM_MUTEX(mod)); } -EXPORT_SYMBOL(__kernel_param_lock); -void __kernel_param_unlock(void) +void kernel_param_unlock(struct module *mod) { - mutex_unlock(¶m_lock); + mutex_unlock(KPARAM_MUTEX(mod)); } -EXPORT_SYMBOL(__kernel_param_unlock); + +EXPORT_SYMBOL(kernel_param_lock); +EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs @@ -853,6 +913,7 @@ static void __init version_sysfs_builtin(void) mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + WARN_ON_ONCE(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } diff --git a/kernel/kernel/pid.c b/kernel/kernel/pid.c index 4fd07d5b7..78b3d9f80 100644 --- a/kernel/kernel/pid.c +++ b/kernel/kernel/pid.c @@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held(), - "find_task_by_pid_ns() needs rcu_read_lock()" - " protection"); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } @@ -468,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_lock(); if (type != PIDTYPE_PID) task = task->group_leader; - pid = get_pid(task->pids[type].pid); + pid = get_pid(rcu_dereference(task->pids[type].pid)); rcu_read_unlock(); return pid; } @@ -529,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; - nr = pid_nr_ns(task->pids[type].pid, ns); + nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); diff --git a/kernel/kernel/power/Kconfig b/kernel/kernel/power/Kconfig index 9e302315e..02e8dfaa1 100644 --- a/kernel/kernel/power/Kconfig +++ b/kernel/kernel/power/Kconfig @@ -18,6 +18,16 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config SUSPEND_SKIP_SYNC + bool "Skip kernel's sys_sync() on suspend to RAM/standby" + depends on SUSPEND + depends on EXPERT + help + Skip the kernel sys_sync() before freezing user processes. + Some systems prefer not to pay this cost on every invocation + of suspend, or they are content with invoking sync() from + user-space before invoking suspend. Say Y if that's your case. + config HIBERNATE_CALLBACKS bool diff --git a/kernel/kernel/power/Makefile b/kernel/kernel/power/Makefile index 29472bff1..cb880a14c 100644 --- a/kernel/kernel/power/Makefile +++ b/kernel/kernel/power/Makefile @@ -7,8 +7,7 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ - block_io.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o diff --git a/kernel/kernel/power/block_io.c b/kernel/kernel/power/block_io.c deleted file mode 100644 index 9a58bc258..000000000 --- a/kernel/kernel/power/block_io.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * This file provides functions for block I/O operations on swap/file. - * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> - * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. - */ - -#include <linux/bio.h> -#include <linux/kernel.h> -#include <linux/pagemap.h> -#include <linux/swap.h> - -#include "power.h" - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, struct block_device *bdev, sector_t sector, - struct page *page, struct bio **bio_chain) -{ - const int bio_rw = rw | REQ_SYNC; - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)sector); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} diff --git a/kernel/kernel/power/hibernate.c b/kernel/kernel/power/hibernate.c index b8f41a363..fbb23f93e 100644 --- a/kernel/kernel/power/hibernate.c +++ b/kernel/kernel/power/hibernate.c @@ -557,7 +557,7 @@ int hibernation_platform_enter(void) error = disable_nonboot_cpus(); if (error) - goto Platform_finish; + goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; @@ -575,6 +575,8 @@ int hibernation_platform_enter(void) syscore_resume(); system_state = SYSTEM_RUNNING; local_irq_enable(); + + Enable_cpus: enable_nonboot_cpus(); Platform_finish: @@ -646,6 +648,10 @@ static void power_down(void) cpu_relax(); } +#ifndef CONFIG_SUSPEND +bool pm_in_action; +#endif + /** * hibernate - Carry out system hibernation, including saving the image. */ @@ -658,6 +664,8 @@ int hibernate(void) return -EPERM; } + pm_in_action = true; + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { @@ -723,6 +731,7 @@ int hibernate(void) atomic_inc(&snapshot_device_available); Unlock: unlock_system_sleep(); + pm_in_action = false; return error; } @@ -738,7 +747,7 @@ int hibernate(void) * contents of memory is restored from the saved image. * * If this is successful, control reappears in the restored target kernel in - * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ diff --git a/kernel/kernel/power/main.c b/kernel/kernel/power/main.c index 86e8157a4..b2dd4d999 100644 --- a/kernel/kernel/power/main.c +++ b/kernel/kernel/power/main.c @@ -272,7 +272,23 @@ static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } -#else /* !CONFIG_PP_SLEEP_DEBUG */ + +static ssize_t pm_wakeup_irq_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; +} + +static ssize_t pm_wakeup_irq_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} +power_attr(pm_wakeup_irq); + +#else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ @@ -604,6 +620,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, + &pm_wakeup_irq_attr.attr, #endif #endif #ifdef CONFIG_FREEZER diff --git a/kernel/kernel/power/power.h b/kernel/kernel/power/power.h index ce9b8328a..caadb566e 100644 --- a/kernel/kernel/power/power.h +++ b/kernel/kernel/power/power.h @@ -163,15 +163,6 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -/* kernel/power/block_io.c */ -extern struct block_device *hib_resume_bdev; - -extern int hib_bio_read_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_bio_write_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_wait_on_bio_chain(struct bio **bio_chain); - struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); diff --git a/kernel/kernel/power/snapshot.c b/kernel/kernel/power/snapshot.c index 5235dd4e1..3a9706043 100644 --- a/kernel/kernel/power/snapshot.c +++ b/kernel/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/kernel/power/suspend.c b/kernel/kernel/power/suspend.c index db920b170..393bc342c 100644 --- a/kernel/kernel/power/suspend.c +++ b/kernel/kernel/power/suspend.c @@ -35,6 +35,9 @@ const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; +unsigned int pm_suspend_global_flags; +EXPORT_SYMBOL_GPL(pm_suspend_global_flags); + static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); @@ -368,6 +371,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) trace_suspend_resume(TPS("machine_suspend"), state, false); events_check_enabled = false; + } else if (*wakeup) { + error = -EBUSY; } syscore_resume(); } @@ -472,7 +477,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for freeze state," + pr_warning("PM: Unsupported test mode for suspend to idle," "please choose none/freezer/devices/platform.\n"); return -EAGAIN; } @@ -486,13 +491,16 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) freeze_begin(); +#ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); +#endif - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); + pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) goto Unlock; @@ -501,7 +509,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pr_debug("PM: Suspending system (%s)\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); @@ -514,6 +522,8 @@ static int enter_state(suspend_state_t state) return error; } +bool pm_in_action; + /** * pm_suspend - Externally visible function for suspending the system. * @state: System sleep state to enter. @@ -528,6 +538,8 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; + pm_in_action = true; + error = enter_state(state); if (error) { suspend_stats.fail++; @@ -535,6 +547,7 @@ int pm_suspend(suspend_state_t state) } else { suspend_stats.success++; } + pm_in_action = false; return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/kernel/power/swap.c b/kernel/kernel/power/swap.c index 570aff817..12cd989da 100644 --- a/kernel/kernel/power/swap.c +++ b/kernel/kernel/power/swap.c @@ -212,7 +212,80 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -struct block_device *hib_resume_bdev; +static struct block_device *hib_resume_bdev; + +struct hib_bio_batch { + atomic_t count; + wait_queue_head_t wait; + int error; +}; + +static void hib_init_batch(struct hib_bio_batch *hb) +{ + atomic_set(&hb->count, 0); + init_waitqueue_head(&hb->wait); + hb->error = 0; +} + +static void hib_end_io(struct bio *bio) +{ + struct hib_bio_batch *hb = bio->bi_private; + struct page *page = bio->bi_io_vec[0].bv_page; + + if (bio->bi_error) { + printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + } + + if (bio_data_dir(bio) == WRITE) + put_page(page); + + if (bio->bi_error && !hb->error) + hb->error = bio->bi_error; + if (atomic_dec_and_test(&hb->count)) + wake_up(&hb->wait); + + bio_put(bio); +} + +static int hib_submit_io(int rw, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) +{ + struct page *page = virt_to_page(addr); + struct bio *bio; + int error = 0; + + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); + bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = hib_resume_bdev; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + return -EFAULT; + } + + if (hb) { + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(rw, bio); + } else { + error = submit_bio_wait(rw, bio); + bio_put(bio); + } + + return error; +} + +static int hib_wait_io(struct hib_bio_batch *hb) +{ + wait_event(hb->wait, atomic_read(&hb->count) == 0); + return hb->error; +} /* * Saving part @@ -222,7 +295,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -231,7 +304,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -271,10 +344,10 @@ static int swsusp_swap_check(void) * write_page - Write one page to given swap location. * @buf: Address we're writing. * @offset: Offset of the swap page we're writing to. - * @bio_chain: Link the next write BIO here + * @hb: bio completion batch */ -static int write_page(void *buf, sector_t offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { void *src; int ret; @@ -282,30 +355,30 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (!offset) return -ENOSPC; - if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + if (hb) { + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { - ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ + hb = NULL; /* Go synchronous */ src = buf; } } } else { src = buf; } - return hib_bio_write_page(offset, src, bio_chain); + return hib_submit_io(WRITE_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -348,7 +421,7 @@ err_close: } static int swap_write_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { int error = 0; sector_t offset; @@ -356,7 +429,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!handle->cur) return -EINVAL; offset = alloc_swapdev_block(root_swap); - error = write_page(buf, offset, bio_chain); + error = write_page(buf, offset, hb); if (error) return error; handle->cur->entries[handle->k++] = offset; @@ -365,15 +438,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, bio_chain); + error = write_page(handle->cur, handle->cur_swap, hb); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); + if (hb && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_io(hb); if (error) goto out; /* @@ -445,23 +518,24 @@ static int save_image(struct swap_map_handle *handle, int ret; int nr_pages; int err2; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; + hib_init_batch(&hb); + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) break; - ret = swap_write_page(handle, data_of(*snapshot), &bio); + ret = swap_write_page(handle, data_of(*snapshot), &hb); if (ret) break; if (!(nr_pages % m)) @@ -469,7 +543,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -580,7 +654,7 @@ static int save_image_lzo(struct swap_map_handle *handle, int ret = 0; int nr_pages; int err2; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; size_t off; @@ -589,6 +663,8 @@ static int save_image_lzo(struct swap_map_handle *handle, struct cmp_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for compression to limit memory * footprint. @@ -596,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -674,7 +750,6 @@ static int save_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { @@ -748,7 +823,7 @@ static int save_image_lzo(struct swap_map_handle *handle, off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); - ret = swap_write_page(handle, page, &bio); + ret = swap_write_page(handle, page, &hb); if (ret) goto out_finish; } @@ -759,7 +834,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } out_finish: - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -900,13 +975,13 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; } - error = hib_bio_read_page(offset, tmp->map, NULL); + error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -919,7 +994,7 @@ static int get_swap_reader(struct swap_map_handle *handle, } static int swap_read_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { sector_t offset; int error; @@ -930,7 +1005,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_bio_read_page(offset, buf, bio_chain); + error = hib_submit_io(READ_SYNC, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -968,27 +1043,28 @@ static int load_image(struct swap_map_handle *handle, int ret = 0; ktime_t start; ktime_t stop; - struct bio *bio; + struct hib_bio_batch hb; int err2; unsigned nr_pages; + hib_init_batch(&hb); + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) break; - ret = swap_read_page(handle, data_of(*snapshot), &bio); + ret = swap_read_page(handle, data_of(*snapshot), &hb); if (ret) break; if (snapshot->sync_read) - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) break; if (!(nr_pages % m)) @@ -996,7 +1072,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1067,7 +1143,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned int m; int ret = 0; int eof = 0; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; unsigned nr_pages; @@ -1080,6 +1156,8 @@ static int load_image_lzo(struct swap_map_handle *handle, struct dec_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for decompression to limit memory * footprint. @@ -1164,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { @@ -1190,7 +1268,6 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); ret = snapshot_write_next(snapshot); @@ -1199,7 +1276,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for(;;) { for (i = 0; !eof && i < want; i++) { - ret = swap_read_page(handle, page[ring], &bio); + ret = swap_read_page(handle, page[ring], &hb); if (ret) { /* * On real read error, finish. On end of data, @@ -1226,7 +1303,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!asked) break; - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1281,7 +1358,7 @@ static int load_image_lzo(struct swap_map_handle *handle, * Wait for more data while we are decompressing. */ if (have < LZO_CMP_PAGES && asked) { - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1430,7 +1507,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_bio_read_page(swsusp_resume_block, + error = hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1438,7 +1515,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -1482,10 +1559,10 @@ int swsusp_unmark(void) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); diff --git a/kernel/kernel/power/wakelock.c b/kernel/kernel/power/wakelock.c index 019069c84..1896386e1 100644 --- a/kernel/kernel/power/wakelock.c +++ b/kernel/kernel/power/wakelock.c @@ -17,6 +17,7 @@ #include <linux/list.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include "power.h" @@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {} #define WL_GC_COUNT_MAX 100 #define WL_GC_TIME_SEC 300 +static void __wakelocks_gc(struct work_struct *work); static LIST_HEAD(wakelocks_lru_list); +static DECLARE_WORK(wakelock_work, __wakelocks_gc); static unsigned int wakelocks_gc_count; static inline void wakelocks_lru_add(struct wakelock *wl) @@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl) list_move(&wl->lru, &wakelocks_lru_list); } -static void wakelocks_gc(void) +static void __wakelocks_gc(struct work_struct *work) { struct wakelock *wl, *aux; ktime_t now; - if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) - return; + mutex_lock(&wakelocks_lock); now = ktime_get(); list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { @@ -127,6 +129,16 @@ static void wakelocks_gc(void) } } wakelocks_gc_count = 0; + + mutex_unlock(&wakelocks_lock); +} + +static void wakelocks_gc(void) +{ + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + schedule_work(&wakelock_work); } #else /* !CONFIG_PM_WAKELOCKS_GC */ static inline void wakelocks_lru_add(struct wakelock *wl) {} diff --git a/kernel/kernel/printk/printk.c b/kernel/kernel/printk/printk.c index c2d5877ae..f75e4b0c6 100644 --- a/kernel/kernel/printk/printk.c +++ b/kernel/kernel/printk/printk.c @@ -85,6 +85,18 @@ static struct lockdep_map console_lock_dep_map = { #endif /* + * Number of registered extended console drivers. + * + * If extended consoles are present, in-kernel cont reassembly is disabled + * and each fragment is stored as a separate log entry with proper + * continuation flag so that every emitted message has full metadata. This + * doesn't change the result for regular consoles or /proc/kmsg. For + * /dev/kmsg, as long as the reader concatenates messages according to + * consecutive continuation flags, the end result should be the same too. + */ +static int nr_ext_console_drivers; + +/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ @@ -195,14 +207,14 @@ static int console_may_schedule; * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: - * "level,sequnum,timestamp;<message text>\n" + * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" + * + * Users of the export format should ignore possible additional values + * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. - * - * Users of the export format should ignore possible additional values - * separated by ',', and find the message after the ';' character. */ enum log_flags { @@ -229,6 +241,65 @@ struct printk_log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +static void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} + +/* + * This is independent of any log levels - a global + * kill switch that turns off all of printk. + * + * Used by the NMI watchdog if early-printk is enabled. + */ +static bool __read_mostly printk_killswitch; + +static int __init force_early_printk_setup(char *str) +{ + printk_killswitch = true; + return 0; +} +early_param("force_early_printk", force_early_printk_setup); + +void printk_kill(void) +{ + printk_killswitch = true; +} + +#ifdef CONFIG_PRINTK +static int forced_early_printk(const char *fmt, va_list ap) +{ + if (!printk_killswitch) + return 0; + early_vprintk(fmt, ap); + return 1; +} +#endif + +#else +static inline int forced_early_printk(const char *fmt, va_list ap) +{ + return 0; +} +#endif + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -257,6 +328,9 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 @@ -477,13 +551,13 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ - if (from_file && type != SYSLOG_ACTION_OPEN) + if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { @@ -505,7 +579,88 @@ int check_syslog_permissions(int type, bool from_file) ok: return security_syslog(type); } +EXPORT_SYMBOL_GPL(check_syslog_permissions); + +static void append_char(char **pp, char *e, char c) +{ + if (*pp < e) + *(*pp)++ = c; +} +static ssize_t msg_print_ext_header(char *buf, size_t size, + struct printk_log *msg, u64 seq, + enum log_flags prev_flags) +{ + u64 ts_usec = msg->ts_nsec; + char cont = '-'; + + do_div(ts_usec, 1000); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + return scnprintf(buf, size, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, seq, ts_usec, cont); +} + +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) +{ + char *p = buf, *e = buf + size; + size_t i; + + /* escape non-printable characters */ + for (i = 0; i < text_len; i++) { + unsigned char c = text[i]; + + if (c < ' ' || c >= 127 || c == '\\') + p += scnprintf(p, e - p, "\\x%02x", c); + else + append_char(&p, e, c); + } + append_char(&p, e, '\n'); + + if (dict_len) { + bool line = true; + + for (i = 0; i < dict_len; i++) { + unsigned char c = dict[i]; + + if (line) { + append_char(&p, e, ' '); + line = false; + } + + if (c == '\0') { + append_char(&p, e, '\n'); + line = true; + continue; + } + + if (c < ' ' || c >= 127 || c == '\\') { + p += scnprintf(p, e - p, "\\x%02x", c); + continue; + } + + append_char(&p, e, c); + } + append_char(&p, e, '\n'); + } + + return p - buf; +} /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { @@ -513,13 +668,12 @@ struct devkmsg_user { u32 idx; enum log_flags prev; struct mutex lock; - char buf[8192]; + char buf[CONSOLE_EXT_LOG_MAX]; }; static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -549,12 +703,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; @@ -571,9 +726,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, { struct devkmsg_user *user = file->private_data; struct printk_log *msg; - u64 ts_usec; - size_t i; - char cont = '-'; size_t len; ssize_t ret; @@ -609,66 +761,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, } msg = log_from_idx(user->idx); - ts_usec = msg->ts_nsec; - do_div(ts_usec, 1000); - - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; + len = msg_print_ext_header(user->buf, sizeof(user->buf), + msg, user->seq, user->prev); + len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); - len = sprintf(user->buf, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, - user->seq, ts_usec, cont); user->prev = msg->flags; - - /* escape non-printable characters */ - for (i = 0; i < msg->text_len; i++) { - unsigned char c = log_text(msg)[i]; - - if (c < ' ' || c >= 127 || c == '\\') - len += sprintf(user->buf + len, "\\x%02x", c); - else - user->buf[len++] = c; - } - user->buf[len++] = '\n'; - - if (msg->dict_len) { - bool line = true; - - for (i = 0; i < msg->dict_len; i++) { - unsigned char c = log_dict(msg)[i]; - - if (line) { - user->buf[len++] = ' '; - line = false; - } - - if (c == '\0') { - user->buf[len++] = '\n'; - line = true; - continue; - } - - if (c < ' ' || c >= 127 || c == '\\') { - len += sprintf(user->buf + len, "\\x%02x", c); - continue; - } - - user->buf[len++] = c; - } - user->buf[len++] = '\n'; - } - user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); @@ -799,7 +898,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * This appends the listed symbols to /proc/vmcore * @@ -1279,13 +1378,13 @@ out: return len; } -int do_syslog(int type, char __user *buf, int len, bool from_file) +int do_syslog(int type, char __user *buf, int len, int source) { bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; - error = check_syslog_permissions(type, from_file); + error = check_syslog_permissions(type, source); if (error) goto out; @@ -1368,7 +1467,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) syslog_prev = 0; syslog_partial = 0; } - if (from_file) { + if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of @@ -1415,7 +1514,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, const char *text, size_t len) +static void call_console_drivers(int level, + const char *ext_text, size_t ext_len, + const char *text, size_t len) { struct console *con; @@ -1437,7 +1538,10 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!cpu_online(smp_processor_id()) && !(con->flags & CON_ANYTIME)) continue; - con->write(con, text, len); + if (con->flags & CON_EXTENDED) + con->write(con, ext_text, ext_len); + else + con->write(con, text, len); } migrate_enable(); } @@ -1590,8 +1694,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len) if (cont.len && cont.flushed) return false; - if (cont.len + len > sizeof(cont.buf)) { - /* the line gets too long, split it up in separate records */ + /* + * If ext consoles are present, flush and skip in-kernel + * continuation. See nr_ext_console_drivers definition. Also, if + * the line gets too long, split it up in separate records. + */ + if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { cont_flush(LOG_CONT); return false; } @@ -1643,62 +1751,6 @@ static size_t cont_print_text(char *text, size_t size) return textlen; } -#ifdef CONFIG_EARLY_PRINTK -struct console *early_console; - -static void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - -asmlinkage void early_printk(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} - -/* - * This is independent of any log levels - a global - * kill switch that turns off all of printk. - * - * Used by the NMI watchdog if early-printk is enabled. - */ -static bool __read_mostly printk_killswitch; - -static int __init force_early_printk_setup(char *str) -{ - printk_killswitch = true; - return 0; -} -early_param("force_early_printk", force_early_printk_setup); - -void printk_kill(void) -{ - printk_killswitch = true; -} - -static int forced_early_printk(const char *fmt, va_list ap) -{ - if (!printk_killswitch) - return 0; - early_vprintk(fmt, ap); - return 1; -} -#else -static inline int forced_early_printk(const char *fmt, va_list ap) -{ - return 0; -} -#endif - asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) @@ -1988,9 +2040,19 @@ static struct cont { u8 level; bool flushed:1; } cont; +static char *log_text(const struct printk_log *msg) { return NULL; } +static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } -static void call_console_drivers(int level, const char *text, size_t len) {} +static ssize_t msg_print_ext_header(char *buf, size_t size, + struct printk_log *msg, u64 seq, + enum log_flags prev_flags) { return 0; } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) { return 0; } +static void call_console_drivers(int level, + const char *ext_text, size_t ext_len, + const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } @@ -2221,15 +2283,15 @@ static void console_cont_flush(char *text, size_t size) goto out; len = cont_print_text(text, size); -#ifndef CONFIG_PREEMPT_RT_FULL +#ifdef CONFIG_PREEMPT_RT_FULL + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(cont.level, NULL, 0, text, len); +#else raw_spin_unlock(&logbuf_lock); stop_critical_timings(); - call_console_drivers(cont.level, text, len); + call_console_drivers(cont.level, NULL, 0, text, len); start_critical_timings(); local_irq_restore(flags); -#else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - call_console_drivers(cont.level, text, len); #endif return; out: @@ -2252,17 +2314,29 @@ out: */ void console_unlock(void) { + static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; /* flush buffered message fragment immediately to console */ @@ -2270,6 +2344,7 @@ void console_unlock(void) again: for (;;) { struct printk_log *msg; + size_t ext_len = 0; size_t len; int level; @@ -2315,20 +2390,31 @@ skip: level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); + if (nr_ext_console_drivers) { + ext_len = msg_print_ext_header(ext_text, + sizeof(ext_text), + msg, console_seq, console_prev); + ext_len += msg_print_ext_body(ext_text + ext_len, + sizeof(ext_text) - ext_len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); + } console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; #ifdef CONFIG_PREEMPT_RT_FULL raw_spin_unlock_irqrestore(&logbuf_lock, flags); - call_console_drivers(level, text, len); + call_console_drivers(level, ext_text, ext_len, text, len); #else raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, text, len); + call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); #endif + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2396,6 +2482,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ @@ -2582,6 +2687,11 @@ void register_console(struct console *newcon) newcon->next = console_drivers->next; console_drivers->next = newcon; } + + if (newcon->flags & CON_EXTENDED) + if (!nr_ext_console_drivers++) + pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + if (newcon->flags & CON_PRINTBUFFER) { /* * console_unlock(); will print out the buffered messages @@ -2654,6 +2764,9 @@ int unregister_console(struct console *console) } } + if (!res && (console->flags & CON_EXTENDED)) + nr_ext_console_drivers--; + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. diff --git a/kernel/kernel/profile.c b/kernel/kernel/profile.c index a7bcd28d6..99513e116 100644 --- a/kernel/kernel/profile.c +++ b/kernel/kernel/profile.c @@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info, node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -543,14 +543,14 @@ static int create_hash_tables(void) int node = cpu_to_mem(cpu); struct page *page; - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) diff --git a/kernel/kernel/ptrace.c b/kernel/kernel/ptrace.c index b8d001b89..1004af706 100644 --- a/kernel/kernel/ptrace.c +++ b/kernel/kernel/ptrace.c @@ -129,12 +129,14 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !__fatal_signal_pending(task)) { - raw_spin_lock_irq(&task->pi_lock); + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); if (task->state & __TASK_TRACED) task->state = __TASK_TRACED; else task->saved_state = __TASK_TRACED; - raw_spin_unlock_irq(&task->pi_lock); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -224,6 +226,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -233,18 +243,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -311,7 +336,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; @@ -561,6 +586,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); @@ -1008,6 +1046,11 @@ int ptrace_request(struct task_struct *child, long request, break; } #endif + + case PTRACE_SECCOMP_GET_FILTER: + ret = seccomp_get_filter(child, addr, datavp); + break; + default: break; } diff --git a/kernel/kernel/rcu/Makefile b/kernel/kernel/rcu/Makefile index 50a808424..61a16569f 100644 --- a/kernel/kernel/rcu/Makefile +++ b/kernel/kernel/rcu/Makefile @@ -1,4 +1,4 @@ -obj-y += update.o +obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/kernel/rcu/rcutorture.c b/kernel/kernel/rcu/rcutorture.c index 8dbe27611..5bb3364a6 100644 --- a/kernel/kernel/rcu/rcutorture.c +++ b/kernel/kernel/rcu/rcutorture.c @@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p) struct rcu_torture_ops { int ttype; void (*init)(void); + void (*cleanup)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); @@ -251,7 +252,7 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_state)(void); void (*cond_sync)(unsigned long oldstate); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); @@ -389,6 +390,7 @@ static struct rcu_torture_ops rcu_ops = { .name = "rcu" }; +#ifndef CONFIG_PREEMPT_RT_FULL /* * Definitions for rcu_bh torture testing. */ @@ -428,6 +430,12 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; +#else +static struct rcu_torture_ops rcu_bh_ops = { + .ttype = INVALID_RCU_FLAVOR, +}; +#endif + /* * Don't even think about trying any of these in real life!!! * The names includes "busted", and they really means it! @@ -447,7 +455,7 @@ static void synchronize_rcu_busted(void) } static void -call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +call_rcu_busted(struct rcu_head *head, rcu_callback_t func) { /* This is a deliberate bug for testing purposes only! */ func(head); @@ -477,10 +485,12 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +static struct srcu_struct srcu_ctld; +static struct srcu_struct *srcu_ctlp = &srcu_ctl; -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) { - return srcu_read_lock(&srcu_ctl); + return srcu_read_lock(srcu_ctlp); } static void srcu_read_delay(struct torture_random_state *rrsp) @@ -499,49 +509,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp) rcu_read_delay(rrsp); } -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) { - srcu_read_unlock(&srcu_ctl, idx); + srcu_read_unlock(srcu_ctlp, idx); } static unsigned long srcu_torture_completed(void) { - return srcu_batches_completed(&srcu_ctl); + return srcu_batches_completed(srcu_ctlp); } static void srcu_torture_deferred_free(struct rcu_torture *rp) { - call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); + call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb); } static void srcu_torture_synchronize(void) { - synchronize_srcu(&srcu_ctl); + synchronize_srcu(srcu_ctlp); } static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { - call_srcu(&srcu_ctl, head, func); + call_srcu(srcu_ctlp, head, func); } static void srcu_torture_barrier(void) { - srcu_barrier(&srcu_ctl); + srcu_barrier(srcu_ctlp); } static void srcu_torture_stats(void) { int cpu; - int idx = srcu_ctl.completed & 0x1; + int idx = srcu_ctlp->completed & 0x1; pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { long c0, c1; - c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; + c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; + c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); @@ -549,7 +559,7 @@ static void srcu_torture_stats(void) static void srcu_torture_synchronize_expedited(void) { - synchronize_srcu_expedited(&srcu_ctl); + synchronize_srcu_expedited(srcu_ctlp); } static struct rcu_torture_ops srcu_ops = { @@ -569,6 +579,38 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static void srcu_torture_init(void) +{ + rcu_sync_torture_init(); + WARN_ON(init_srcu_struct(&srcu_ctld)); + srcu_ctlp = &srcu_ctld; +} + +static void srcu_torture_cleanup(void) +{ + cleanup_srcu_struct(&srcu_ctld); + srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ +} + +/* As above, but dynamically allocated. */ +static struct rcu_torture_ops srcud_ops = { + .ttype = SRCU_FLAVOR, + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .started = NULL, + .completed = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .name = "srcud" +}; + /* * Definitions for sched torture testing. */ @@ -600,6 +642,8 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, + .get_state = get_state_synchronize_sched, + .cond_sync = cond_synchronize_sched, .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, @@ -649,10 +693,20 @@ static struct rcu_torture_ops tasks_ops = { #define RCUTORTURE_TASKS_OPS &tasks_ops, +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + #else /* #ifdef CONFIG_TASKS_RCU */ #define RCUTORTURE_TASKS_OPS +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + #endif /* #else #ifdef CONFIG_TASKS_RCU */ /* @@ -672,8 +726,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head) struct rcu_boost_inflight *rbip = container_of(head, struct rcu_boost_inflight, rcu); - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; + /* Ensure RCU-core accesses precede clearing ->inflight */ + smp_store_release(&rbip->inflight, 0); } static int rcu_torture_boost(void *arg) @@ -710,9 +764,9 @@ static int rcu_torture_boost(void *arg) call_rcu_time = jiffies; while (ULONG_CMP_LT(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; + if (!smp_load_acquire(&rbi.inflight)) { + /* RCU core before ->inflight = 1. */ + smp_store_release(&rbi.inflight, 1); call_rcu(&rbi.rcu, rcu_torture_boost_cb); if (jiffies - call_rcu_time > test_boost_duration * HZ - HZ / 2) { @@ -721,7 +775,6 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -751,11 +804,10 @@ checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. */ - while (!kthread_should_stop() || rbi.inflight) { + while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); } - smp_mb(); /* order accesses to ->inflight before stack-frame death. */ destroy_rcu_head_on_stack(&rbi.rcu); torture_kthread_stopping("rcu_torture_boost"); return 0; @@ -789,9 +841,7 @@ rcu_torture_cbflood(void *arg) } if (err) { VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - while (!torture_must_stop()) - schedule_timeout_interruptible(HZ); - return 0; + goto wait_for_stop; } VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); do { @@ -810,6 +860,7 @@ rcu_torture_cbflood(void *arg) stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); vfree(rhp); +wait_for_stop: torture_kthread_stopping("rcu_torture_cbflood"); return 0; } @@ -1054,7 +1105,8 @@ static void rcu_torture_timer(unsigned long unused) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1128,7 +1180,8 @@ rcu_torture_reader(void *arg) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1161,7 +1214,6 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1413,12 +1465,15 @@ static int rcu_torture_barrier_cbs(void *arg) do { wait_event(barrier_cbs_wq[myid], (newphase = - ACCESS_ONCE(barrier_phase)) != lastphase || + smp_load_acquire(&barrier_phase)) != lastphase || torture_must_stop()); lastphase = newphase; - smp_mb(); /* ensure barrier_phase load before ->call(). */ if (torture_must_stop()) break; + /* + * The above smp_load_acquire() ensures barrier_phase load + * is ordered before the folloiwng ->call(). + */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); @@ -1439,8 +1494,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - smp_mb(); /* Ensure barrier_phase after prior assignments. */ - barrier_phase = !barrier_phase; + /* Ensure barrier_phase ordered after prior assignments. */ + smp_store_release(&barrier_phase, !barrier_phase); for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, @@ -1470,7 +1525,7 @@ static int rcu_torture_barrier_init(void) int i; int ret; - if (n_barrier_cbs == 0) + if (n_barrier_cbs <= 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { pr_alert("%s" TORTURE_FLAG @@ -1588,10 +1643,14 @@ rcu_torture_cleanup(void) rcutorture_booster_cleanup(i); } - /* Wait for all RCU callbacks to fire. */ - + /* + * Wait for all RCU callbacks to fire, then do flavor-specific + * cleanup operations. + */ if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ @@ -1668,8 +1727,8 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, - RCUTORTURE_TASKS_OPS + &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, + &sched_ops, RCUTORTURE_TASKS_OPS }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -1688,20 +1747,20 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cur_ops->init(); if (nreaders >= 0) { nrealreaders = nreaders; } else { - nrealreaders = num_online_cpus() - 1; + nrealreaders = num_online_cpus() - 2 - nreaders; if (nrealreaders <= 0) nrealreaders = 1; } @@ -1745,12 +1804,15 @@ rcu_torture_init(void) writer_task); if (firsterr) goto unwind; - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nfakewriters > 0) { + fakewriter_tasks = kzalloc(nfakewriters * + sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } for (i = 0; i < nfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, @@ -1777,7 +1839,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - if (test_no_idle_hz) { + if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); if (firsterr) goto unwind; diff --git a/kernel/kernel/rcu/srcu.c b/kernel/kernel/rcu/srcu.c index cad76e76b..a63a1ea5a 100644 --- a/kernel/kernel/rcu/srcu.c +++ b/kernel/kernel/rcu/srcu.c @@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) unsigned long t; for_each_possible_cpu(cpu) { - t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); + t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); sum += t; } return sum; @@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) unsigned long t; for_each_possible_cpu(cpu) { - t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); + t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); sum += t; } return sum; @@ -252,21 +252,22 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) } /** - * srcu_readers_active - returns approximate number of readers. + * srcu_readers_active - returns true if there are readers. and false + * otherwise * @sp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static int srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *sp) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); + sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); } return sum; } @@ -296,12 +297,10 @@ int __srcu_read_lock(struct srcu_struct *sp) { int idx; - idx = ACCESS_ONCE(sp->completed) & 0x1; - preempt_disable(); + idx = READ_ONCE(sp->completed) & 0x1; __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ __this_cpu_inc(sp->per_cpu_ref->seq[idx]); - preempt_enable(); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -386,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { unsigned long flags; @@ -414,11 +413,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) struct rcu_head *head = &rcu.head; bool done = false; - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); might_sleep(); init_completion(&rcu.completion); diff --git a/kernel/kernel/rcu/sync.c b/kernel/kernel/rcu/sync.c new file mode 100644 index 000000000..be922c9f3 --- /dev/null +++ b/kernel/kernel/rcu/sync.c @@ -0,0 +1,223 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov <oleg@redhat.com> + */ + +#include <linux/rcu_sync.h> +#include <linux/sched.h> + +#ifdef CONFIG_PROVE_RCU +#define __INIT_HELD(func) .held = func, +#else +#define __INIT_HELD(func) +#endif + +static const struct { + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + void (*wait)(void); +#ifdef CONFIG_PROVE_RCU + int (*held)(void); +#endif +} gp_ops[] = { + [RCU_SYNC] = { + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, + __INIT_HELD(rcu_read_lock_held) + }, + [RCU_SCHED_SYNC] = { + .sync = synchronize_sched, + .call = call_rcu_sched, + .wait = rcu_barrier_sched, + __INIT_HELD(rcu_read_lock_sched_held) + }, + [RCU_BH_SYNC] = { + .sync = synchronize_rcu_bh, + .call = call_rcu_bh, + .wait = rcu_barrier_bh, + __INIT_HELD(rcu_read_lock_bh_held) + }, +}; + +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; + +#define rss_lock gp_wait.lock + +#ifdef CONFIG_PROVE_RCU +void rcu_sync_lockdep_assert(struct rcu_sync *rsp) +{ + RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), + "suspicious rcu_sync_is_idle() usage"); +} +#endif + +/** + * rcu_sync_init() - Initialize an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be initialized + * @type: Flavor of RCU with which to synchronize rcu_sync structure + */ +void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +{ + memset(rsp, 0, sizeof(*rsp)); + init_waitqueue_head(&rsp->gp_wait); + rsp->gp_type = type; +} + +/** + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + bool need_wait, need_sync; + + spin_lock_irq(&rsp->rss_lock); + need_wait = rsp->gp_count++; + need_sync = rsp->gp_state == GP_IDLE; + if (need_sync) + rsp->gp_state = GP_PENDING; + spin_unlock_irq(&rsp->rss_lock); + + BUG_ON(need_wait && need_sync); + + if (need_sync) { + gp_ops[rsp->gp_type].sync(); + rsp->gp_state = GP_PASSED; + wake_up_all(&rsp->gp_wait); + } else if (need_wait) { + wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); + } else { + /* + * Possible when there's a pending CB from a rcu_sync_exit(). + * Nobody has yet been allowed the 'fast' path and thus we can + * avoid doing any sync(). The callback will get 'dropped'. + */ + BUG_ON(rsp->gp_state != GP_PASSED); + } +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is passed to one of the call_rcu() functions by + * rcu_sync_exit(), so that it is invoked after a grace period following the + * that invocation of rcu_sync_exit(). It takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rcu) +{ + struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + unsigned long flags; + + BUG_ON(rsp->gp_state != GP_PASSED); + BUG_ON(rsp->cb_state == CB_IDLE); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * A new rcu_sync_begin() has happened; drop the callback. + */ + rsp->cb_state = CB_IDLE; + } else if (rsp->cb_state == CB_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback + * to catch a later GP. + */ + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else { + /* + * We're at least a GP after rcu_sync_exit(); eveybody will now + * have observed the write side critical section. Let 'em rip!. + */ + rsp->cb_state = CB_IDLE; + rsp->gp_state = GP_IDLE; + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who have completed, and can therefore + * now allow readers to make use of their fastpaths after a grace period + * has elapsed. After this grace period has completed, all subsequent + * calls to rcu_sync_is_idle() will return true, which tells readers that + * they can once again use their fastpaths. + */ +void rcu_sync_exit(struct rcu_sync *rsp) +{ + spin_lock_irq(&rsp->rss_lock); + if (!--rsp->gp_count) { + if (rsp->cb_state == CB_IDLE) { + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else if (rsp->cb_state == CB_PENDING) { + rsp->cb_state = CB_REPLAY; + } + } + spin_unlock_irq(&rsp->rss_lock); +} + +/** + * rcu_sync_dtor() - Clean up an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be cleaned up + */ +void rcu_sync_dtor(struct rcu_sync *rsp) +{ + int cb_state; + + BUG_ON(rsp->gp_count); + + spin_lock_irq(&rsp->rss_lock); + if (rsp->cb_state == CB_REPLAY) + rsp->cb_state = CB_PENDING; + cb_state = rsp->cb_state; + spin_unlock_irq(&rsp->rss_lock); + + if (cb_state != CB_IDLE) { + gp_ops[rsp->gp_type].wait(); + BUG_ON(rsp->cb_state != CB_IDLE); + } +} diff --git a/kernel/kernel/rcu/tiny.c b/kernel/kernel/rcu/tiny.c index ec3086879..944b1b491 100644 --- a/kernel/kernel/rcu/tiny.c +++ b/kernel/kernel/rcu/tiny.c @@ -35,7 +35,7 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include "rcu.h" @@ -44,44 +44,11 @@ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp); #include "tiny_plugin.h" -/* - * Enter idle, which is an extended quiescent state if we have fully - * entered that mode. - */ -void rcu_idle_enter(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_idle_enter); - -/* - * Exit an interrupt handler towards idle. - */ -void rcu_irq_exit(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_irq_exit); - -/* - * Exit idle, so that we are no longer in an extended quiescent state. - */ -void rcu_idle_exit(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_idle_exit); - -/* - * Enter an interrupt handler, moving away from idle. - */ -void rcu_irq_enter(void) -{ -} -EXPORT_SYMBOL_GPL(rcu_irq_enter); - #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) /* @@ -224,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -236,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * Helper function for call_rcu() and call_rcu_bh(). */ static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp) { unsigned long flags; @@ -262,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_ctrlblk); } @@ -272,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_ctrlblk); } diff --git a/kernel/kernel/rcu/tiny_plugin.h b/kernel/kernel/rcu/tiny_plugin.h index f94e209a1..e492a5253 100644 --- a/kernel/kernel/rcu/tiny_plugin.h +++ b/kernel/kernel/rcu/tiny_plugin.h @@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) return; rcp->ticks_this_gp++; j = jiffies; - js = ACCESS_ONCE(rcp->jiffies_stall); + js = READ_ONCE(rcp->jiffies_stall); if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - ACCESS_ONCE(rcp->jiffies_stall) = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); } else if (ULONG_CMP_GE(j, js)) { - ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + WRITE_ONCE(rcp->jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } } @@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; - ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + WRITE_ONCE(rcp->jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } static void check_cpu_stalls(void) diff --git a/kernel/kernel/rcu/tree.c b/kernel/kernel/rcu/tree.c index 965df22d9..d862a203f 100644 --- a/kernel/kernel/rcu/tree.c +++ b/kernel/kernel/rcu/tree.c @@ -54,7 +54,7 @@ #include <linux/delay.h> #include <linux/stop_machine.h> #include <linux/random.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/suspend.h> #include <linux/delay.h> #include <linux/gfp.h> @@ -75,6 +75,7 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -96,12 +97,12 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ DEFINE_RCU_TPS(sname) \ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .rda = &sname##_data, \ .call = cr, \ - .fqs_state = RCU_GP_IDLE, \ + .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ @@ -115,20 +116,22 @@ struct rcu_state sname##_state = { \ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -static struct rcu_state *rcu_state_p; +static struct rcu_state *const rcu_state_p; +static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); -/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ -static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +/* Dump rcu_node combining tree at boot to verify correct setup. */ +static bool dump_tree; +module_param(dump_tree, bool, 0444); +/* Control rcu_node-tree auto-balancing at boot time. */ +static bool rcu_fanout_exact; +module_param(rcu_fanout_exact, bool, 0444); +/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ - NUM_RCU_LVL_0, - NUM_RCU_LVL_1, - NUM_RCU_LVL_2, - NUM_RCU_LVL_3, - NUM_RCU_LVL_4, -}; +/* Number of rcu_nodes at specified level. */ +static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* @@ -162,19 +165,50 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake); /* rcuc/rcub kthread realtime priority */ +#ifdef CONFIG_RCU_KTHREAD_PRIO static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ +static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; +#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ + +#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT +static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; +module_param(gp_preinit_delay, int, 0644); +#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ +static const int gp_preinit_delay; +#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ + #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; module_param(gp_init_delay, int, 0644); #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ static const int gp_init_delay; #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */ + +#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP +static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; +module_param(gp_cleanup_delay, int, 0644); +#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static const int gp_cleanup_delay; +#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ + +/* + * Number of grace periods between delays, normalized by the duration of + * the delay. The longer the the delay, the more the grace periods between + * each delay. The reason for this normalization is that it means that, + * for non-zero delays, the overall slowdown of grace periods is constant + * regardless of the duration of the delay. This arrangement balances + * the need for long delays to increase some race probabilities with the + * need for fast grace periods to increase other race probabilities. + */ +#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ /* * Track the rcutorture test sequence number and the update version @@ -196,17 +230,17 @@ unsigned long rcutorture_vernum; */ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { - return ACCESS_ONCE(rnp->qsmaskinitnext); + return READ_ONCE(rnp->qsmaskinitnext); } /* - * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * Return true if an RCU grace period is in progress. The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. */ static int rcu_gp_in_progress(struct rcu_state *rsp) { - return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); + return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum); } /* @@ -217,11 +251,23 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + unsigned long flags; + + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), + true); + } + local_irq_restore(flags); } } @@ -240,11 +286,11 @@ void rcu_bh_qs(void) #else void rcu_bh_qs(void) { - if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } } #endif @@ -297,8 +343,8 @@ static void rcu_momentary_dyntick_idle(void) if (!(resched_mask & rsp->flavor_mask)) continue; smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (ACCESS_ONCE(rdp->mynode->completed) != - ACCESS_ONCE(rdp->cond_resched_completed)) + if (READ_ONCE(rdp->mynode->completed) != + READ_ONCE(rdp->cond_resched_completed)) continue; /* @@ -323,12 +369,14 @@ static void rcu_momentary_dyntick_idle(void) */ void rcu_note_context_switch(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -339,12 +387,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * */ void rcu_all_qs(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_qs_ctr); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -399,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched); /* * Return the number of RCU BH batches started thus far for debug & stats. */ +#ifndef CONFIG_PREEMPT_RT_FULL unsigned long rcu_batches_started_bh(void) { return rcu_bh_state.gpnum; } EXPORT_SYMBOL_GPL(rcu_batches_started_bh); +#endif /* * Return the number of RCU batches completed thus far for debug & stats. @@ -508,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, case RCU_FLAVOR: rsp = rcu_state_p; break; +#ifndef CONFIG_PREEMPT_RT_FULL case RCU_BH_FLAVOR: rsp = &rcu_bh_state; break; +#endif case RCU_SCHED_FLAVOR: rsp = &rcu_sched_state; break; @@ -518,9 +577,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, break; } if (rsp != NULL) { - *flags = ACCESS_ONCE(rsp->gp_flags); - *gpnum = ACCESS_ONCE(rsp->gpnum); - *completed = ACCESS_ONCE(rsp->completed); + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); return; } *flags = 0; @@ -566,10 +625,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_future_needs_gp(struct rcu_state *rsp) { struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; + int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; - return ACCESS_ONCE(*fp); + return READ_ONCE(*fp); } /* @@ -592,7 +651,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Yes, this CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) return 1; /* Yes, CBs for future grace period. */ return 0; /* No grace period needed. */ @@ -612,7 +671,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); - if (!user && !is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); @@ -631,19 +691,20 @@ static void rcu_eqs_enter_common(long long oldval, bool user) smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); smp_mb__after_atomic(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + atomic_read(&rdtp->dynticks) & 0x1); rcu_dynticks_task_enter(); /* * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /* @@ -657,7 +718,8 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; rcu_eqs_enter_common(oldval, user); @@ -689,7 +751,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -702,7 +764,7 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -730,7 +792,8 @@ void rcu_irq_exit(void) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting < 0); if (rdtp->dynticks_nesting) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else @@ -755,10 +818,12 @@ static void rcu_eqs_exit_common(long long oldval, int user) atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); - if (!user && !is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); @@ -782,7 +847,7 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval < 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { @@ -813,7 +878,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -824,7 +889,7 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -855,7 +920,8 @@ void rcu_irq_enter(void) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; - WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting == 0); if (oldval) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else @@ -962,9 +1028,9 @@ bool notrace rcu_is_watching(void) { bool ret; - preempt_disable(); + preempt_disable_notrace(); ret = __rcu_is_watching(); - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_is_watching); @@ -1038,9 +1104,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { - if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) - ACCESS_ONCE(rdp->gpwrap) = true; + WRITE_ONCE(rdp->gpwrap, true); return 0; } } @@ -1120,12 +1186,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs) || ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - ACCESS_ONCE(rdp->cond_resched_completed) = - ACCESS_ONCE(rdp->mynode->completed); + if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { + WRITE_ONCE(rdp->cond_resched_completed, + READ_ONCE(rdp->mynode->completed)); smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - ACCESS_ONCE(*rcrmp) = - ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; + WRITE_ONCE(*rcrmp, + READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ rdp->rsp->jiffies_resched += 5; /* Enable beating. */ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { @@ -1146,9 +1212,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - ACCESS_ONCE(rsp->jiffies_stall) = j + j1; + WRITE_ONCE(rsp->jiffies_stall, j + j1); rsp->jiffies_resched = j + j1 / 2; - rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); + rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } /* @@ -1160,10 +1226,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) unsigned long j; j = jiffies; - gpa = ACCESS_ONCE(rsp->gp_activity); + gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies!\n", - rsp->name, j - gpa); + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + rsp->name, j - gpa, + rsp->gpnum, rsp->completed, + rsp->gp_flags, rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : 0); } /* @@ -1200,12 +1269,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); + delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -1239,12 +1309,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) if (ndetected) { rcu_dump_cpu_stacks(rsp); } else { - if (ACCESS_ONCE(rsp->gpnum) != gpnum || - ACCESS_ONCE(rsp->completed) == gpnum) { + if (READ_ONCE(rsp->gpnum) != gpnum || + READ_ONCE(rsp->completed) == gpnum) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = ACCESS_ONCE(rsp->gp_activity); + gpa = READ_ONCE(rsp->gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rsp->name, j - gpa, j, gpa, jiffies_till_next_fqs, @@ -1289,9 +1359,9 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) - ACCESS_ONCE(rsp->jiffies_stall) = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; + if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -1334,20 +1404,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) * Given this check, comparisons of jiffies, rsp->jiffies_stall, * and rsp->gp_start suffice to forestall false positives. */ - gpnum = ACCESS_ONCE(rsp->gpnum); + gpnum = READ_ONCE(rsp->gpnum); smp_rmb(); /* Pick up ->gpnum first... */ - js = ACCESS_ONCE(rsp->jiffies_stall); + js = READ_ONCE(rsp->jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = ACCESS_ONCE(rsp->gp_start); + gps = READ_ONCE(rsp->gp_start); smp_rmb(); /* ...and finally ->gp_start before ->completed. */ - completed = ACCESS_ONCE(rsp->completed); + completed = READ_ONCE(rsp->completed); if (ULONG_CMP_GE(completed, gpnum) || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { + (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -1374,7 +1444,7 @@ void rcu_cpu_stall_reset(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; + WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } /* @@ -1484,7 +1554,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * doing some extra useless work. */ if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { + READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; @@ -1551,7 +1621,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -1569,10 +1638,10 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) static void rcu_gp_kthread_wake(struct rcu_state *rsp) { if (current == rsp->gp_kthread || - !ACCESS_ONCE(rsp->gp_flags) || + !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - swait_wake(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -1704,7 +1773,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && - !unlikely(ACCESS_ONCE(rdp->gpwrap))) { + !unlikely(READ_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1719,7 +1788,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { + if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1727,11 +1796,11 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->passed_quiesce = 0; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); - ACCESS_ONCE(rdp->gpwrap) = false; + WRITE_ONCE(rdp->gpwrap, false); } return ret; } @@ -1744,9 +1813,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; - if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed) && - !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ + if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && + rdp->completed == READ_ONCE(rnp->completed) && + !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; @@ -1758,6 +1827,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) rcu_gp_kthread_wake(rsp); } +static void rcu_gp_slow(struct rcu_state *rsp, int delay) +{ + if (delay > 0 && + !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + schedule_timeout_uninterruptible(delay); +} + /* * Initialize a new grace period. Return 0 if no grace period required. */ @@ -1767,15 +1843,15 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - if (!ACCESS_ONCE(rsp->gp_flags)) { + if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); return 0; } - ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ + WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { /* @@ -1800,6 +1876,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * will handle subsequent offline CPUs. */ rcu_for_each_leaf_node(rsp, rnp) { + rcu_gp_slow(rsp, gp_preinit_delay); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1856,14 +1933,15 @@ static int rcu_gp_init(struct rcu_state *rsp) * process finishes, because this kthread handles both. */ rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; + WRITE_ONCE(rnp->gpnum, rsp->gpnum); if (WARN_ON_ONCE(rnp->completed != rsp->completed)) - ACCESS_ONCE(rnp->completed) = rsp->completed; + WRITE_ONCE(rnp->completed, rsp->completed); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1872,28 +1950,44 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; - if (gp_init_delay > 0 && - !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD))) - schedule_timeout_uninterruptible(gp_init_delay); + WRITE_ONCE(rsp->gp_activity, jiffies); } return 1; } /* + * Helper function for wait_event_interruptible_timeout() wakeup + * at force-quiescent-state time. + */ +static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Someone like call_rcu() requested a force-quiescent-state scan. */ + *gfp = READ_ONCE(rsp->gp_flags); + if (*gfp & RCU_GP_FLAG_FQS) + return true; + + /* The current grace period has completed. */ + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) + return true; + + return false; +} + +/* * Do one round of quiescent-state forcing. */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - int fqs_state = fqs_state_in; bool isidle = false; unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { + if (first_time) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { isidle = true; @@ -1902,21 +1996,19 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); rcu_sysidle_report_gp(rsp, isidle, maxj); - fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ isidle = true; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - ACCESS_ONCE(rsp->gp_flags) = - ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; + WRITE_ONCE(rsp->gp_flags, + READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); } - return fqs_state; } /* @@ -1929,8 +2021,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct swait_queue_head *sq; - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1961,15 +2054,18 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) smp_mb__after_unlock_lock(); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); - ACCESS_ONCE(rnp->completed) = rsp->gpnum; + WRITE_ONCE(rnp->completed, rsp->gpnum); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); + sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq(&rnp->lock); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); + rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1977,16 +2073,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ - ACCESS_ONCE(rsp->completed) = rsp->gpnum; + WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); - rsp->fqs_state = RCU_GP_IDLE; + rsp->gp_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; if (needgp || cpu_needs_another_gp(rsp, rdp)) { - ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; + WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("newreq")); } raw_spin_unlock_irq(&rnp->lock); @@ -1997,7 +2093,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { - int fqs_state; + bool first_gp_fqs; int gf; unsigned long j; int ret; @@ -2010,25 +2106,26 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; swait_event_interruptible(rsp->gp_wq, - ACCESS_ONCE(rsp->gp_flags) & + READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ - fqs_state = RCU_SAVE_DYNTICK; + first_gp_fqs = true; j = jiffies_till_first_fqs; if (j > HZ) { j = HZ; @@ -2039,39 +2136,37 @@ static int __noreturn rcu_gp_kthread(void *arg) if (!ret) rsp->jiffies_force_qs = jiffies + j; trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = swait_event_interruptible_timeout(rsp->gp_wq, - ((gf = ACCESS_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || - (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); + rcu_gp_fqs_check_wake(rsp, &gf), j); + rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ - if (!ACCESS_ONCE(rnp->qsmask) && + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqsstart")); - fqs_state = rcu_gp_fqs(rsp, fqs_state); + rcu_gp_fqs(rsp, first_gp_fqs); + first_gp_fqs = false; trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); - ACCESS_ONCE(rsp->gp_activity) = jiffies; + WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), + READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); } j = jiffies_till_next_fqs; @@ -2085,7 +2180,9 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle grace-period end. */ + rsp->gp_state = RCU_GP_CLEANUP; rcu_gp_cleanup(rsp); + rsp->gp_state = RCU_GP_CLEANED; } } @@ -2113,8 +2210,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, */ return false; } - ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); /* @@ -2164,8 +2261,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2292,7 +2390,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->passed_quiesce == 0 && + if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2303,7 +2401,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2312,7 +2410,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->qs_pending = 0; + rdp->core_needs_qs = 0; /* * This GP can't end until cpu checks in, so all of our @@ -2343,14 +2441,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Does this CPU still need to do its part for current grace period? * If no, return and let the other CPUs do their part as well. */ - if (!rdp->qs_pending) + if (!rdp->core_needs_qs) return; /* * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce && + if (rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -2361,8 +2459,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } -#ifdef CONFIG_HOTPLUG_CPU - /* * Send the specified CPU's RCU callbacks to the orphanage. The * specified CPU must be offline, and the caller must hold the @@ -2373,7 +2469,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* No-CBs CPUs do not have orphanable callbacks. */ - if (rcu_is_nocb_cpu(rdp->cpu)) + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; /* @@ -2386,7 +2482,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; + WRITE_ONCE(rdp->qlen, 0); } /* @@ -2432,7 +2528,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. */ @@ -2479,6 +2576,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), @@ -2507,7 +2607,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; for (;;) { mask = rnp->grpmask; @@ -2538,6 +2639,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave(&rnp->lock, flags); @@ -2559,6 +2663,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); @@ -2573,26 +2680,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) cpu, rdp->qlen, rdp->nxtlist); } -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) -{ -} - -static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ -} - -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ -} - -static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. @@ -2607,7 +2694,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), + trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); return; @@ -2663,7 +2750,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; + WRITE_ONCE(rdp->qlen, rdp->qlen - count); rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -2757,10 +2844,6 @@ static void force_qs_rnp(struct rcu_state *rsp, mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (!rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2790,8 +2873,6 @@ static void force_qs_rnp(struct rcu_state *rsp, bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) == 0) - *isidle = false; /* Pending hotplug. */ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } @@ -2820,7 +2901,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Funnel through hierarchy to reduce memory contention. */ rnp = __this_cpu_read(rsp->rda->mynode); for (; rnp != NULL; rnp = rnp->parent) { - ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); @@ -2836,15 +2917,14 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp_old->lock, flags); smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - ACCESS_ONCE(rsp->gp_flags) = - ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; + WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2907,7 +2987,7 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { - if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) + if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; rcu_do_batch(rsp, rdp); } @@ -3076,7 +3156,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), +__call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; @@ -3085,7 +3165,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ - ACCESS_ONCE(head->func) = rcu_leak_callback; + WRITE_ONCE(head->func, rcu_leak_callback); WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } @@ -3124,7 +3204,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (!likely(rdp->nxtlist)) init_default_callback_list(rdp); } - ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; + WRITE_ONCE(rdp->qlen, rdp->qlen + 1); if (lazy) rdp->qlen_lazy++; else @@ -3147,7 +3227,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* * Queue an RCU-sched callback for invocation after a grace period. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_state, -1, 0); } @@ -3157,7 +3237,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Queue an RCU callback for invocation after a quicker grace period. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_state, -1, 0); } @@ -3172,7 +3252,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * function may only be called from __kfree_rcu(). */ void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) + rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 1); } @@ -3241,10 +3321,10 @@ static inline int rcu_blocking_is_gp(void) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3269,10 +3349,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3335,190 +3415,574 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); -static int synchronize_sched_expedited_cpu_stop(void *data) +/** + * get_state_synchronize_sched - Snapshot current RCU-sched state + * + * Returns a cookie that is used by a later call to cond_synchronize_sched() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_sched(void) { /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. */ - smp_mb(); /* See above comment block. */ - return 0; + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_sched() + * and cond_synchronize_sched(). + */ + return smp_load_acquire(&rcu_sched_state.gpnum); } +EXPORT_SYMBOL_GPL(get_state_synchronize_sched); /** - * synchronize_sched_expedited - Brute-force RCU-sched grace period + * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. + * @oldstate: return value from earlier call to get_state_synchronize_sched() * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. + * If a full RCU-sched grace period has elapsed since the earlier call to + * get_state_synchronize_sched(), just return. Otherwise, invoke + * synchronize_sched() to wait for a full grace period. * - * If we fail too many times in a row, we fall back to synchronize_sched(). + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. */ -void synchronize_sched_expedited(void) +void cond_synchronize_sched(unsigned long oldstate) { - cpumask_var_t cm; - bool cma = false; - int cpu; - long firstsnap, s, snap; - int trycount = 0; - struct rcu_state *rsp = &rcu_sched_state; + unsigned long newstate; /* - * If we are in danger of counter wrap, just do synchronize_sched(). - * By allowing sync_sched_expedited_started to advance no more than - * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring - * that more than 3.5 billion CPUs would be required to force a - * counter wrap on a 32-bit system. Quite a few more CPUs would of - * course be required on a 64-bit system. + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. */ - if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), - (ulong)atomic_long_read(&rsp->expedited_done) + - ULONG_MAX / 8)) { + newstate = smp_load_acquire(&rcu_sched_state.completed); + if (ULONG_CMP_GE(oldstate, newstate)) synchronize_sched(); - atomic_long_inc(&rsp->expedited_wrap); +} +EXPORT_SYMBOL_GPL(cond_synchronize_sched); + +/* Adjust sequence number for start of update-side operation. */ +static void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(!(*sp & 0x1)); +} + +/* Adjust sequence number for end of update-side operation. */ +static void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WRITE_ONCE(*sp, *sp + 1); + WARN_ON_ONCE(*sp & 0x1); +} + +/* Take a snapshot of the update side's sequence number. */ +static unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(*sp) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + return rcu_seq_snap(&rsp->expedited_sequence); +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) return; - } + rsp->ncpus_snap = ncpus; /* - * Take a ticket. Note that atomic_inc_return() implies a - * full memory barrier. + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. */ - snap = atomic_long_inc_return(&rsp->expedited_start); - firstsnap = snap; - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave(&rnp_up->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } } - WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ - cma = zalloc_cpumask_var(&cm, GFP_KERNEL); - if (cma) { - cpumask_copy(cm, cpu_online_mask); - cpumask_clear_cpu(raw_smp_processor_id(), cm); - for_each_cpu(cpu, cm) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - cpumask_clear_cpu(cpu, cm); + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the root rcu_node's exp_funnel_mutex and the + * specified rcu_node structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; } - if (cpumask_weight(cm) == 0) - goto all_cpus_idle; + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the root + * rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp, + atomic_long_t *stat, unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + if (rnp) + mutex_unlock(&rnp->exp_funnel_mutex); + else if (rdp) + mutex_unlock(&rdp->exp_funnel_mutex); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns a + * pointer to the root rcu_node structure, or NULL if some other + * task did the expedited grace period for us. + */ +static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp; + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. + * First try directly acquiring the root lock in order to reduce + * latency in the common case where expedited grace periods are + * rare. We check mutex_is_locked() to avoid pathological levels of + * memory contention on ->exp_funnel_mutex in the heavy-load case. */ - while (try_stop_cpus(cma ? cm : cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - atomic_long_inc(&rsp->expedited_tryfail); - - /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone1); - free_cpumask_var(cm); - return; + rnp0 = rcu_get_root(rsp); + if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { + if (mutex_trylock(&rnp0->exp_funnel_mutex)) { + if (sync_exp_work_done(rsp, rnp0, NULL, + &rsp->expedited_workdone0, s)) + return NULL; + return rnp0; } + } - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); - return; - } + /* + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. + */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rdp->exp_funnel_mutex); + rnp0 = rdp->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone2, s)) + return NULL; + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + else + mutex_unlock(&rdp->exp_funnel_mutex); + rnp1 = rnp0; + } + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone3, s)) + return NULL; + return rnp1; +} - /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone2); - free_cpumask_var(cm); - return; +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We retry - * after they started, so our grace period works for them, - * and they started after our first try, so their grace - * period works for us. + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. */ - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, use normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); - return; + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + } else { + /* Failed, raced with offline. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, + flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave(&rnp->lock, + flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } } - snap = atomic_long_read(&rsp->expedited_start); - smp_mb(); /* ensure read is before try_stop_cpus(). */ + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); } - atomic_long_inc(&rsp->expedited_stoppedcpus); +} -all_cpus_idle: - free_cpumask_var(cm); +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did already did their update. - */ - do { - atomic_long_inc(&rsp->expedited_done_tries); - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_done_lost); - break; + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; } - } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); - atomic_long_inc(&rsp->expedited_done_exit); + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + rcu_for_each_leaf_node(rsp, rnp) { + (void)rcu_print_task_exp_stall(rnp); + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + + if (!(rnp->expmask & mask)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + mask <<= 1; + } + pr_cont(" } %lu jiffies s: %lu\n", + jiffies - jiffies_start, rsp->expedited_sequence); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} - put_online_cpus(); +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + + rnp = exp_funnel_lock(rsp, s); + if (rnp == NULL) + return; /* Someone else did our work for us. */ + + rcu_exp_gp_seq_start(rsp); + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + synchronize_sched_expedited_wait(rsp); + + rcu_exp_gp_seq_end(rsp); + mutex_unlock(&rnp->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -3544,11 +4008,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce && + rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { - rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && - (rdp->passed_quiesce || + rdp->n_rp_core_needs_qs++; + } else if (rdp->core_needs_qs && + (!rdp->cpu_no_qs.b.norm || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -3567,14 +4031,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ + if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ rdp->n_rp_gp_completed++; return 1; } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ + if (READ_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } @@ -3610,7 +4074,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3653,10 +4117,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); } } @@ -3668,7 +4132,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -3681,55 +4145,24 @@ static void _rcu_barrier(struct rcu_state *rsp) { int cpu; struct rcu_data *rdp; - unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); - unsigned long snap_done; + unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, snap); + _rcu_barrier_trace(rsp, "Begin", -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - /* - * Ensure that all prior references, including to ->n_barrier_done, - * are ordered before the _rcu_barrier() machinery. - */ - smp_mb(); /* See above block comment. */ - - /* - * Recheck ->n_barrier_done to see if others did our work for us. - * This means checking ->n_barrier_done for an even-to-odd-to-even - * transition. The "if" expression below therefore rounds the old - * value up to the next even number and adds two before comparing. - */ - snap_done = rsp->n_barrier_done; - _rcu_barrier_trace(rsp, "Check", -1, snap_done); - - /* - * If the value in snap is odd, we needed to wait for the current - * rcu_barrier() to complete, then wait for the next one, in other - * words, we need the value of snap_done to be three larger than - * the value of snap. On the other hand, if the value in snap is - * even, we only had to wait for the next rcu_barrier() to complete, - * in other words, we need the value of snap_done to be only two - * greater than the value of snap. The "(snap + 3) & ~0x1" computes - * this for us (thank you, Linus!). - */ - if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + /* Did someone else do our work for us? */ + if (rcu_seq_done(&rsp->barrier_sequence, s)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; } - /* - * Increment ->n_barrier_done to avoid duplicate work. Use - * ACCESS_ONCE() to prevent the compiler from speculating - * the increment to precede the early-exit check. - */ - ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); - smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ + /* Mark the start of the barrier operation. */ + rcu_seq_start(&rsp->barrier_sequence); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3753,22 +4186,22 @@ static void _rcu_barrier(struct rcu_state *rsp) if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); } - } else if (ACCESS_ONCE(rdp->qlen)) { + } else if (READ_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } } put_online_cpus(); @@ -3780,16 +4213,13 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); - /* Increment ->n_barrier_done to prevent duplicate work. */ - smp_mb(); /* Keep increment after above mechanism. */ - ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); - smp_mb(); /* Keep increment before caller's subsequent code. */ - /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); + /* Mark the end of the barrier operation. */ + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + rcu_seq_end(&rsp->barrier_sequence); + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); } @@ -3854,6 +4284,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; + mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -3874,7 +4305,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3896,11 +4326,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + if (!rdp->beenonline) + WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); + rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; - rdp->passed_quiesce = false; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = false; + rdp->cpu_no_qs.b.norm = true; + rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -3933,6 +4367,7 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: + sync_sched_exp_online_cleanup(cpu); rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: @@ -3944,6 +4379,12 @@ int rcu_cpu_notify(struct notifier_block *self, rcu_cleanup_dying_cpu(rsp); break; case CPU_DYING_IDLE: + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp); } @@ -4043,24 +4484,24 @@ void rcu_scheduler_starting(void) /* * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) +static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; - if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; + levelspread[i] = RCU_FANOUT; } else { int ccur; int cprv; cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; } } @@ -4072,44 +4513,40 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static const char * const buf[] = { - "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static const char * const fqs[] = { - "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = RCU_NODE_NAME_INIT; + static const char * const fqs[] = RCU_FQS_NAME_INIT; + static const char * const exp[] = RCU_EXP_NAME_INIT; static u8 fl_mask = 0x1; + + int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ + int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; int j; struct rcu_node *rnp; - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ - /* Silence gcc 4.8 warning about array index out of range. */ - if (rcu_num_lvls > RCU_NUM_LVLS) - panic("rcu_init_one: rcu_num_lvls overflow"); + /* Silence gcc 4.8 false positive about array index out of range. */ + if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls out of range"); /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) - rsp->levelcnt[i] = num_rcu_lvl[i]; + levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); + rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; + rcu_init_levelspread(levelspread, levelcnt); rsp->flavor_mask = fl_mask; fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; + cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + for (j = 0; j < levelcnt[i]; j++, rnp++) { raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); @@ -4129,18 +4566,22 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->grpmask = 0; rnp->parent = NULL; } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpnum = j % levelspread[i - 1]; rnp->grpmask = 1UL << rnp->grpnum; rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; + j / levelspread[i - 1]; } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); + mutex_init(&rnp->exp_funnel_mutex); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } - init_swait_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -4160,9 +4601,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int j; - int n = nr_cpu_ids; - int rcu_capacity[MAX_RCU_LVLS + 1]; + int rcu_capacity[RCU_NUM_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4178,54 +4617,80 @@ static void __init rcu_init_geometry(void) jiffies_till_next_fqs = d; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", rcu_fanout_leaf, nr_cpu_ids); /* + * The boot-time rcu_fanout_leaf parameter must be at least two + * and cannot exceed the number of bits in the rcu_node masks. + * Complain and fall back to the compile-time values if this + * limit is exceeded. + */ + if (rcu_fanout_leaf < 2 || + rcu_fanout_leaf > sizeof(unsigned long) * 8) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; + WARN_ON(1); + return; + } + + /* * Compute number of nodes that can be handled an rcu_node tree - * with the given number of levels. Setting rcu_capacity[0] makes - * some of the arithmetic easier. + * with the given number of levels. */ - rcu_capacity[0] = 1; - rcu_capacity[1] = rcu_fanout_leaf; - for (i = 2; i <= MAX_RCU_LVLS; i++) - rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + rcu_capacity[0] = rcu_fanout_leaf; + for (i = 1; i < RCU_NUM_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Finally, the tree must be able to accommodate - * the configured number of CPUs. Complain and fall back to the - * compile-time values if these limits are exceeded. + * The tree must be able to accommodate the configured number of CPUs. + * If this limit is exceeded, fall back to the compile-time values. */ - if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || - rcu_fanout_leaf > sizeof(unsigned long) * 8 || - n > rcu_capacity[MAX_RCU_LVLS]) { + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; } + /* Calculate the number of levels in the tree. */ + for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { + } + rcu_num_lvls = i + 1; + /* Calculate the number of rcu_nodes at each level of the tree. */ - for (i = 1; i <= MAX_RCU_LVLS; i++) - if (n <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) - num_rcu_lvl[j] = - DIV_ROUND_UP(n, rcu_capacity[i - j]); - rcu_num_lvls = i; - for (j = i + 1; j <= MAX_RCU_LVLS; j++) - num_rcu_lvl[j] = 0; - break; - } + for (i = 0; i < rcu_num_lvls; i++) { + int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; + num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); + } /* Calculate the total number of rcu_node structures. */ rcu_num_nodes = 0; - for (i = 0; i <= MAX_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= n; +} + +/* + * Dump out the structure of the rcu_node combining tree associated + * with the rcu_state structure referenced by rsp. + */ +static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) +{ + int level = 0; + struct rcu_node *rnp; + + pr_info("rcu_node tree layout dump\n"); + pr_info(" "); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp->level != level) { + pr_cont("\n"); + pr_info(" "); + level = rnp->level; + } + pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum); + } + pr_cont("\n"); } void __init rcu_init(void) @@ -4236,8 +4701,12 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); +#ifndef CONFIG_PREEMPT_RT_FULL rcu_init_one(&rcu_bh_state, &rcu_bh_data); +#endif rcu_init_one(&rcu_sched_state, &rcu_sched_data); + if (dump_tree) + rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); /* diff --git a/kernel/kernel/rcu/tree.h b/kernel/kernel/rcu/tree.h index 8a9f0d364..c75834d8d 100644 --- a/kernel/kernel/rcu/tree.h +++ b/kernel/kernel/rcu/tree.h @@ -27,7 +27,8 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> -#include <linux/wait-simple.h> +#include <linux/swait.h> +#include <linux/stop_machine.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -36,47 +37,74 @@ * In practice, this did work well going from three levels to four. * Of course, your mileage may vary. */ -#define MAX_RCU_LVLS 4 -#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT_LEAF 64 +# else +# define RCU_FANOUT_LEAF 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 # define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_3 (NR_CPUS) -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - extern int rcu_num_lvls; extern int rcu_num_nodes; @@ -136,16 +164,21 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blkd_tasks */ - /* elements that need to drain to allow the */ - /* current expedited grace period to */ - /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initial value for qsmask & expmask. */ + /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; /* Online CPUs for next grace period. */ + unsigned long expmask; /* CPUs or groups that need to check in */ + /* to allow the current expedited GP */ + /* to complete. */ + unsigned long expmaskinit; + /* Per-GP initial values for expmask. */ + /* Initialized from ->expmaskinitnext at the */ + /* beginning of each expedited GP. */ + unsigned long expmaskinitnext; + /* Online CPUs for next expedited GP. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -171,7 +204,6 @@ struct rcu_node { /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there can cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST struct list_head *boost_tasks; /* Pointer to first task that needs to be */ /* priority boosted, or NULL if no priority */ @@ -209,14 +241,15 @@ struct rcu_node { unsigned long n_balk_nos; /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU - struct swait_head nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; + + struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -246,6 +279,18 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Union to allow "aggregate OR" operation on the need for a quiescent + * state by the normal and expedited grace periods. + */ +union rcu_noqs { + struct { + u8 norm; + u8 exp; + } b; /* Bits. */ + u16 s; /* Set of bits, aggregate OR here. */ +}; + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -262,18 +307,16 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool passed_quiesce; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ + union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ + bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -327,7 +370,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_qs_pending; + unsigned long n_rp_core_needs_qs; unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; @@ -336,11 +379,12 @@ struct rcu_data { unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() and OOM callbacks. */ + /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + struct mutex exp_funnel_mutex; /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -350,7 +394,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - struct swait_head nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -368,21 +412,12 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned int softirq_snap; /* Snapshot of softirq activity. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ int cpu; struct rcu_state *rsp; }; -/* Values for fqs_state field in struct rcu_state. */ -#define RCU_GP_IDLE 0 /* No grace period in progress. */ -#define RCU_GP_INIT 1 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK - /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOGP_WAKE_NOT 0 #define RCU_NOGP_WAKE 1 @@ -423,23 +458,22 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + struct rcu_node *level[RCU_NUM_LVLS + 1]; + /* Hierarchy levels (+1 to */ + /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ - void (*func)(struct rcu_head *head)); + call_rcu_func_t call; /* call_rcu() flavor. */ + int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ - u8 fqs_state ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - u8 boost; /* Subject to priority boost. */ + u8 boost ____cacheline_internodealigned_in_smp; + /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - struct swait_head gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -460,21 +494,19 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ - unsigned long n_barrier_done; /* ++ at start and end of */ + unsigned long barrier_sequence; /* ++ at start and end of */ /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ - atomic_long_t expedited_start; /* Starting ticket. */ - atomic_long_t expedited_done; /* Done ticket. */ - atomic_long_t expedited_wrap; /* # near-wrap incidents. */ - atomic_long_t expedited_tryfail; /* # acquisition failures. */ + unsigned long expedited_sequence; /* Take a ticket. */ + atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ - atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ - atomic_long_t expedited_done_tries; /* # tries to update _done. */ - atomic_long_t expedited_done_lost; /* # times beaten to _done. */ - atomic_long_t expedited_done_exit; /* # times exited _done loop. */ + atomic_t expedited_need_qs; /* # CPUs left to check in. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ + int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ @@ -505,10 +537,14 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ -/* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +/* Values for rcu_state structure's gp_state field. */ +#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ -#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ +#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ extern struct list_head rcu_struct_flavors; @@ -520,14 +556,13 @@ extern struct list_head rcu_struct_flavors; * RCU implementation internal declarations: */ extern struct rcu_state rcu_sched_state; -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); +#ifndef CONFIG_PREEMPT_RT_FULL extern struct rcu_state rcu_bh_state; -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +#endif #ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; -DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_PREEMPT_RCU */ DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -546,9 +581,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -571,7 +607,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); @@ -616,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ diff --git a/kernel/kernel/rcu/tree_plugin.h b/kernel/kernel/rcu/tree_plugin.h index 54da8f44d..8e119cf64 100644 --- a/kernel/kernel/rcu/tree_plugin.h +++ b/kernel/kernel/rcu/tree_plugin.h @@ -28,7 +28,17 @@ #include "../locking/rtmutex_common.h" -#endif /* #ifdef CONFIG_RCU_BOOST */ +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, + * all uses are in dead code. Provide a definition to keep the compiler + * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. + * This probably needs to be excluded from -rt builds. + */ +#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ /* * Control variables for per-CPU and per-rcu_node kthreads. These @@ -53,11 +63,11 @@ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) pr_info("\tRCU debugfs-based tracing is enabled.\n"); - if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || - (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) + if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || + (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - CONFIG_RCU_FANOUT); - if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) + RCU_FANOUT); + if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); @@ -65,14 +75,12 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) pr_info("\tRCU torture testing starts during boot.\n"); - if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); - if (NUM_RCU_LVL_4 != 0) - pr_info("\tFour-level hierarchy is enabled.\n"); - if (CONFIG_RCU_FANOUT_LEAF != 16) + if (RCU_NUM_LVLS >= 4) + pr_info("\tFour(or more)-level hierarchy is enabled.\n"); + if (RCU_FANOUT_LEAF != 16) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", - CONFIG_RCU_FANOUT_LEAF); - if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + RCU_FANOUT_LEAF); + if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); @@ -83,9 +91,9 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state_p = &rcu_preempt_state; +static struct rcu_state *const rcu_state_p = &rcu_preempt_state; +static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; -static int rcu_preempted_readers_exp(struct rcu_node *rnp); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); @@ -98,6 +106,147 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } +/* Flags for rcu_preempt_ctxt_queue() decision table. */ +#define RCU_GP_TASKS 0x8 +#define RCU_EXP_TASKS 0x4 +#define RCU_GP_BLKD 0x2 +#define RCU_EXP_BLKD 0x1 + +/* + * Queues a task preempted within an RCU-preempt read-side critical + * section into the appropriate location within the ->blkd_tasks list, + * depending on the states of any ongoing normal and expedited grace + * periods. The ->gp_tasks pointer indicates which element the normal + * grace period is waiting on (NULL if none), and the ->exp_tasks pointer + * indicates which element the expedited grace period is waiting on (again, + * NULL if none). If a grace period is waiting on a given element in the + * ->blkd_tasks list, it also waits on all subsequent elements. Thus, + * adding a task to the tail of the list blocks any grace period that is + * already waiting on one of the elements. In contrast, adding a task + * to the head of the list won't block any grace period that is already + * waiting on one of the elements. + * + * This queuing is imprecise, and can sometimes make an ongoing grace + * period wait for a task that is not strictly speaking blocking it. + * Given the choice, we needlessly block a normal grace period rather than + * blocking an expedited grace period. + * + * Note that an endless sequence of expedited grace periods still cannot + * indefinitely postpone a normal grace period. Eventually, all of the + * fixed number of preempted tasks blocking the normal grace period that are + * not also blocking the expedited grace period will resume and complete + * their RCU read-side critical sections. At that point, the ->gp_tasks + * pointer will equal the ->exp_tasks pointer, at which point the end of + * the corresponding expedited grace period will also be the end of the + * normal grace period. + */ +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long flags) __releases(rnp->lock) +{ + int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + + (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + + (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); + struct task_struct *t = current; + + /* + * Decide where to queue the newly blocked task. In theory, + * this could be an if-statement. In practice, when I tried + * that, it was quite messy. + */ + switch (blkd_state) { + case 0: + case RCU_EXP_TASKS: + case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS: + case RCU_GP_TASKS + RCU_EXP_TASKS: + + /* + * Blocking neither GP, or first task blocking the normal + * GP but not blocking the already-waiting expedited GP. + * Queue at the head of the list to avoid unnecessarily + * blocking the already-waiting GPs. + */ + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_BLKD: + case RCU_GP_BLKD: + case RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + + /* + * First task arriving that blocks either GP, or first task + * arriving that blocks the expedited GP (with the normal + * GP already waiting), or a task arriving that blocks + * both GPs with both GPs already waiting. Queue at the + * tail of the list to avoid any GP waiting on any of the + * already queued tasks that are not blocking it. + */ + list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + + /* + * Second or subsequent task blocking the expedited GP. + * The task either does not block the normal GP, or is the + * first task blocking the normal GP. Queue just after + * the first task blocking the expedited GP. + */ + list_add(&t->rcu_node_entry, rnp->exp_tasks); + break; + + case RCU_GP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + + /* + * Second or subsequent task blocking the normal GP. + * The task does not block the expedited GP. Queue just + * after the first task blocking the normal GP. + */ + list_add(&t->rcu_node_entry, rnp->gp_tasks); + break; + + default: + + /* Yet another exercise in excessive paranoia. */ + WARN_ON_ONCE(1); + break; + } + + /* + * We have now queued the task. If it was the first one to + * block either grace period, update the ->gp_tasks and/or + * ->exp_tasks pointers, respectively, to reference the newly + * blocked tasks. + */ + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + rnp->gp_tasks = &t->rcu_node_entry; + if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) + rnp->exp_tasks = &t->rcu_node_entry; + raw_spin_unlock(&rnp->lock); + + /* + * Report the quiescent state for the expedited GP. This expedited + * GP should not be able to end until we report, so there should be + * no need to check for a subsequent expedited GP. (Though we are + * still in a quiescent state in any case.) + */ + if (blkd_state & RCU_EXP_BLKD && + t->rcu_read_unlock_special.b.exp_need_qs) { + t->rcu_read_unlock_special.b.exp_need_qs = false; + rcu_report_exp_rdp(rdp->rsp, rdp, true); + } else { + WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); + } + local_irq_restore(flags); +} + /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -109,11 +258,11 @@ static void __init rcu_bootup_announce(void) */ static void rcu_preempt_qs(void) { - if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_preempt_data.gpnum), + __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); + __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -143,7 +292,7 @@ static void rcu_preempt_note_context_switch(void) !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = this_cpu_ptr(rcu_preempt_state.rda); + rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); @@ -151,43 +300,18 @@ static void rcu_preempt_note_context_switch(void) t->rcu_blocked_node = rnp; /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. Note that there is some uncertainty as - * to exactly when the current grace period started. - * We take a conservative approach, which can result - * in unnecessarily waiting on tasks that started very - * slightly after the current grace period began. C'est - * la vie!!! - * - * But first, note that the current CPU must still be - * on line! + * Verify the CPU's sanity, trace the preemption, and + * then queue the task as required based on the states + * of any ongoing and expedited grace periods. */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { - list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); - rnp->gp_tasks = &t->rcu_node_entry; -#ifdef CONFIG_RCU_BOOST - if (rnp->boost_tasks != NULL) - rnp->boost_tasks = rnp->gp_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - } else { - list_add(&t->rcu_node_entry, &rnp->blkd_tasks); - if (rnp->qsmask & rdp->grpmask) - rnp->gp_tasks = &t->rcu_node_entry; - } trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_preempt_ctxt_queue(rnp, rdp, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -256,9 +380,8 @@ void rcu_read_unlock_special(struct task_struct *t) bool empty_exp_now; unsigned long flags; struct list_head *np; -#ifdef CONFIG_RCU_BOOST bool drop_boost_mutex = false; -#endif /* #ifdef CONFIG_RCU_BOOST */ + struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; @@ -269,8 +392,8 @@ void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. Because irqs are disabled, + * If RCU core is waiting for this CPU to exit its critical section, + * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; @@ -283,13 +406,32 @@ void rcu_read_unlock_special(struct task_struct *t) } } + /* + * Respond to a request for an expedited grace period, but only if + * we were not preempted, meaning that we were running on the same + * CPU throughout. If we were preempted, the exp_need_qs flag + * would have been cleared at the time of the first preemption, + * and the quiescent state would be reported when we were dequeued. + */ + if (special.b.exp_need_qs) { + WARN_ON_ONCE(special.b.blocked); + t->rcu_read_unlock_special.b.exp_need_qs = false; + rdp = this_cpu_ptr(rcu_state_p->rda); + rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (!t->rcu_read_unlock_special.s) { + local_irq_restore(flags); + return; + } + } + /* Hardware IRQ handlers cannot block, complain if they get here. */ if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { lockdep_rcu_suspicious(__FILE__, __LINE__, "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", t->rcu_read_unlock_special.s, t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.exp_need_qs, t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; @@ -300,9 +442,11 @@ void rcu_read_unlock_special(struct task_struct *t) t->rcu_read_unlock_special.b.blocked = false; /* - * Remove this task from the list it blocked on. The - * task can migrate while we acquire the lock, but at - * most one time. So at most two passes through loop. + * Remove this task from the list it blocked on. The task + * now remains queued on the rcu_node corresponding to + * the CPU it first blocked on, so the first attempt to + * acquire the task's rcu_node's ->lock will succeed. + * Keep the loop and add a WARN_ON() out of sheer paranoia. */ for (;;) { rnp = t->rcu_blocked_node; @@ -310,10 +454,11 @@ void rcu_read_unlock_special(struct task_struct *t) smp_mb__after_unlock_lock(); if (rnp == t->rcu_blocked_node) break; + WARN_ON_ONCE(1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - empty_exp = !rcu_preempted_readers_exp(rnp); + empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -324,12 +469,12 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; - /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ - drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; -#endif /* #ifdef CONFIG_RCU_BOOST */ + if (IS_ENABLED(CONFIG_RCU_BOOST)) { + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; + /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + } /* * If this was the last task on the current list, and if @@ -337,7 +482,7 @@ void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); + empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, @@ -346,24 +491,21 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(&rcu_preempt_state, - rnp, flags); + rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); } -#ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) + if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); -#endif /* #ifdef CONFIG_RCU_BOOST */ /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); + rcu_report_exp_rnp(rcu_state_p, rnp, true); } else { local_irq_restore(flags); } @@ -383,7 +525,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - t = list_entry(rnp->gp_tasks, + t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); @@ -403,8 +545,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#ifdef CONFIG_RCU_CPU_STALL_INFO - static void rcu_print_task_stall_begin(struct rcu_node *rnp) { pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -416,18 +556,6 @@ static void rcu_print_task_stall_end(void) pr_cont("\n"); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -} - -static void rcu_print_task_stall_end(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -440,7 +568,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks, + t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); @@ -451,6 +579,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + +/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -484,17 +633,17 @@ static void rcu_preempt_check_callbacks(void) return; } if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_preempt_data.qs_pending) && - !__this_cpu_read(rcu_preempt_data.passed_quiesce)) + __this_cpu_read(rcu_data_p->core_needs_qs) && + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) t->rcu_read_unlock_special.b.need_qs = true; } /* * Queue a preemptible-RCU callback for invocation after a grace period. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, &rcu_preempt_state, -1, 0); + __call_rcu(head, func, rcu_state_p, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -514,10 +663,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; if (rcu_gp_is_expedited()) @@ -527,157 +676,41 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return !rcu_preempted_readers_exp(rnp) && - ACCESS_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ - unsigned long flags; - unsigned long mask; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - wake_up(&sync_rcu_preempt_exp_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - rnp->expmask &= ~mask; - } -} - -/* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 1. If there - * are such tasks, set the ->expmask bits up the rcu_node tree and also - * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 - * that work is needed here. - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static void -sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp_up; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp->expmask); - WARN_ON_ONCE(rnp->exp_tasks); - if (!rcu_preempt_has_tasks(rnp)) { - /* No blocked tasks, nothing to do. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - /* Call for Phase 2 and propagate ->expmask bits up the tree. */ - rnp->expmask = 1; - rnp_up = rnp; - while (rnp_up->parent) { - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - if (rnp_up->expmask & mask) - break; - raw_spin_lock(&rnp_up->lock); /* irqs already off */ - smp_mb__after_unlock_lock(); - rnp_up->expmask |= mask; - raw_spin_unlock(&rnp_up->lock); /* irqs still off */ - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 2. If the - * leaf rcu_node structure has its ->expmask field set, check for tasks. - * If there are some, clear ->expmask and set ->exp_tasks accordingly, - * then initiate RCU priority boosting. Otherwise, clear ->expmask and - * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, - * enabling rcu_read_unlock_special() to do the bit-clearing. - * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. */ -static void -sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) +static void sync_rcu_exp_handler(void *info) { - unsigned long flags; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - if (!rnp->expmask) { - /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Phase 1 is over. */ - rnp->expmask = 0; + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; /* - * If there are still blocked tasks, set up ->exp_tasks so that - * rcu_read_unlock_special() will wake us and then boost them. + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. */ - if (rcu_preempt_has_tasks(rnp)) { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; return; } - /* No longer any blocked tasks, so undo bit setting. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); } /** @@ -695,81 +728,28 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) void synchronize_rcu_expedited(void) { struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_preempt_state; - unsigned long snap; - int trycount = 0; + struct rcu_node *rnp_unlock; + struct rcu_state *rsp = rcu_state_p; + unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; - smp_mb(); /* Above access cannot bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); - /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - if (!try_get_online_cpus()) { - /* CPU-hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu); - return; - } + rnp_unlock = exp_funnel_lock(rsp, s); + if (rnp_unlock == NULL) + return; /* Someone else did our work for us. */ - /* - * Acquire lock, falling back to synchronize_rcu() if too many - * lock-acquisition failures. Of course, if someone does the - * expedited grace period for us, just leave. - */ - while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - ACCESS_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto mb_ret; /* Others did our work for us. */ - } - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - put_online_cpus(); - wait_rcu_gp(call_rcu); - return; - } - } - if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto unlock_mb_ret; /* Others did our work for us. */ - } + rcu_exp_gp_seq_start(rsp); - /* force all RCU readers onto ->blkd_tasks lists. */ - synchronize_sched_expedited(); - - /* - * Snapshot current state of ->blkd_tasks lists into ->expmask. - * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() - * to start clearing them. Doing this in one phase leads to - * strange races between setting and clearing bits, so just say "no"! - */ - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init1(rsp, rnp); - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init2(rsp, rnp); - - put_online_cpus(); + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); - wait_event(sync_rcu_preempt_exp_wq, - sync_rcu_preempt_exp_done(rnp)); + synchronize_sched_expedited_wait(rsp); /* Clean up and exit. */ - smp_mb(); /* ensure expedited GP seen before counter increment. */ - ACCESS_ONCE(sync_rcu_preempt_exp_count) = - sync_rcu_preempt_exp_count + 1; -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); -mb_ret: - smp_mb(); /* ensure subsequent action seen after grace period. */ + rcu_exp_gp_seq_end(rsp); + mutex_unlock(&rnp_unlock->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -783,7 +763,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state); + _rcu_barrier(rcu_state_p); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -792,7 +772,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); + rcu_init_one(rcu_state_p, rcu_data_p); } /* @@ -815,7 +795,8 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *rcu_state_p = &rcu_sched_state; +static struct rcu_state *const rcu_state_p = &rcu_sched_state; +static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. @@ -869,6 +850,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + +/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. @@ -981,8 +972,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (ACCESS_ONCE(rnp->exp_tasks) == NULL && - ACCESS_ONCE(rnp->boost_tasks) == NULL) + if (READ_ONCE(rnp->exp_tasks) == NULL && + READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1035,13 +1026,12 @@ static int rcu_boost(struct rcu_node *rnp) rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - return ACCESS_ONCE(rnp->exp_tasks) != NULL || - ACCESS_ONCE(rnp->boost_tasks) != NULL; + return READ_ONCE(rnp->exp_tasks) != NULL || + READ_ONCE(rnp->boost_tasks) != NULL; } /* - * Priority-boosting kthread. One per leaf rcu_node and one for the - * root rcu_node. + * Priority-boosting kthread, one per leaf rcu_node. */ static int rcu_boost_kthread(void *arg) { @@ -1143,7 +1133,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct sched_param sp; struct task_struct *t; - if (&rcu_preempt_state != rsp) + if (rcu_state_p != rsp) return 0; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) @@ -1257,13 +1247,12 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -#ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(NULL); + *nextevt = KTIME_MAX; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) + ? 0 : rcu_cpu_has_callbacks(NULL); } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */ #if !defined(CONFIG_RCU_FAST_NO_HZ) @@ -1324,8 +1313,6 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; - /* * Try to advance callbacks for all flavors of RCU on the current CPU, but * only if it has been awhile since the last time we did so. Afterwards, @@ -1354,7 +1341,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * callbacks not yet ready to invoke. */ if ((rdp->completed != rnp->completed || - unlikely(ACCESS_ONCE(rdp->gpwrap))) && + unlikely(READ_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); @@ -1374,17 +1361,22 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ -#ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *dj) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + unsigned long dj; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { + *nextevt = KTIME_MAX; + return 0; + } /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { - *dj = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } @@ -1398,15 +1390,16 @@ int rcu_needs_cpu(unsigned long *dj) /* Request timer delay depending on laziness, and round. */ if (!rdtp->all_lazy) { - *dj = round_up(rcu_idle_gp_delay + jiffies, + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } + *nextevt = basemono + dj * TICK_NSEC; return 0; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_PREEMPT_RT_FULL */ + /* * Prepare a CPU for idle from an RCU perspective. The first major task * is to sense whether nohz mode has been enabled or disabled via sysfs. @@ -1419,7 +1412,6 @@ int rcu_needs_cpu(unsigned long *dj) */ static void rcu_prepare_for_idle(void) { -#ifndef CONFIG_RCU_NOCB_CPU_ALL bool needwake; struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); @@ -1427,8 +1419,11 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + return; + /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_active); + tne = READ_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ @@ -1474,7 +1469,6 @@ static void rcu_prepare_for_idle(void) if (needwake) rcu_gp_kthread_wake(rsp); } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1484,12 +1478,11 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { -#ifndef CONFIG_RCU_NOCB_CPU_ALL - if (rcu_is_nocb_cpu(smp_processor_id())) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1564,12 +1557,10 @@ static int rcu_oom_notify(struct notifier_block *self, */ atomic_set(&oom_callback_count, 1); - get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); cond_resched_rcu_qs(); } - put_online_cpus(); /* Unconditionally decrement: no need to wake ourselves up. */ atomic_dec(&oom_callback_count); @@ -1590,8 +1581,6 @@ early_initcall(rcu_register_oom_notifier); #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -1649,12 +1638,16 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", - cpu, ticks_value, ticks_title, + pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, + READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } @@ -1680,33 +1673,6 @@ static void increment_cpu_stall_ticks(void) raw_cpu_inc(rsp->rda->ticks_this_gp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void print_cpu_stall_info_begin(void) -{ - pr_cont(" {"); -} - -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - pr_cont(" %d", cpu); -} - -static void print_cpu_stall_info_end(void) -{ - pr_cont("} "); -} - -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -} - -static void increment_cpu_stall_ticks(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - #ifdef CONFIG_RCU_NOCB_CPU /* @@ -1751,9 +1717,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - swait_wake_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swake_up_all(sq); } /* @@ -1769,10 +1735,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_swait_head(&rnp->nocb_gp_wq[0]); - init_swait_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1792,12 +1763,12 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) + if (!READ_ONCE(rdp_leader->nocb_kthread)) return; - if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { + if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ - ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; - swait_wake(&rdp_leader->nocb_wq); + WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + swake_up(&rdp_leader->nocb_wq); } } @@ -1828,14 +1799,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) ret = atomic_long_read(&rdp->nocb_q_count); #ifdef CONFIG_PROVE_RCU - rhp = ACCESS_ONCE(rdp->nocb_head); + rhp = READ_ONCE(rdp->nocb_head); if (!rhp) - rhp = ACCESS_ONCE(rdp->nocb_gp_head); + rhp = READ_ONCE(rdp->nocb_gp_head); if (!rhp) - rhp = ACCESS_ONCE(rdp->nocb_follower_head); + rhp = READ_ONCE(rdp->nocb_follower_head); /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && + if (!READ_ONCE(rdp->nocb_kthread) && rhp && rcu_scheduler_fully_active) { /* RCU callback enqueued before CPU first came online??? */ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", @@ -1869,12 +1840,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, atomic_long_add(rhcount, &rdp->nocb_q_count); /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); - ACCESS_ONCE(*old_rhpp) = rhp; + WRITE_ONCE(*old_rhpp, rhp); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ /* If we are not being polled and there is a kthread, awaken it ... */ - t = ACCESS_ONCE(rdp->nocb_kthread); + t = READ_ONCE(rdp->nocb_kthread); if (rcu_nocb_poll || !t) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNotPoll")); @@ -2012,7 +1983,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) break; WARN_ON(signal_pending(current)); @@ -2039,7 +2010,7 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); swait_event_interruptible(my_rdp->nocb_wq, - !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); + !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ @@ -2053,12 +2024,12 @@ wait_again: */ gotcbs = false; for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); + rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) continue; /* No CBs here, try next follower. */ /* Move callbacks to wait-for-GP list, which is empty. */ - ACCESS_ONCE(rdp->nocb_head) = NULL; + WRITE_ONCE(rdp->nocb_head, NULL); rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); gotcbs = true; } @@ -2078,7 +2049,7 @@ wait_again: my_rdp->nocb_leader_sleep = true; smp_mb(); /* Ensure _sleep true before scan. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (ACCESS_ONCE(rdp->nocb_head)) { + if (READ_ONCE(rdp->nocb_head)) { /* Found CB, so short-circuit next wait. */ my_rdp->nocb_leader_sleep = false; break; @@ -2099,7 +2070,7 @@ wait_again: /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (ACCESS_ONCE(rdp->nocb_head)) + if (READ_ONCE(rdp->nocb_head)) my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ @@ -2113,7 +2084,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - swait_wake(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2135,7 +2106,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); swait_event_interruptible(rdp->nocb_wq, - ACCESS_ONCE(rdp->nocb_follower_head)); + READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ firsttime = false; @@ -2176,10 +2147,10 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. */ - list = ACCESS_ONCE(rdp->nocb_follower_head); + list = READ_ONCE(rdp->nocb_follower_head); BUG_ON(!list); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - ACCESS_ONCE(rdp->nocb_follower_head) = NULL; + WRITE_ONCE(rdp->nocb_follower_head, NULL); tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); /* Each pass through the following loop invokes a callback. */ @@ -2218,7 +2189,7 @@ static int rcu_nocb_kthread(void *arg) /* Is a deferred wakeup of rcu_nocb_kthread() required? */ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { - return ACCESS_ONCE(rdp->nocb_defer_wakeup); + return READ_ONCE(rdp->nocb_defer_wakeup); } /* Do a deferred wakeup of rcu_nocb_kthread(). */ @@ -2228,8 +2199,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; - ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); - ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; + ndw = READ_ONCE(rdp->nocb_defer_wakeup); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } @@ -2293,7 +2264,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_swait_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2342,7 +2313,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) t = kthread_run(rcu_nocb_kthread, rdp_spawn, "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); - ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; + WRITE_ONCE(rdp_spawn->nocb_kthread, t); } /* @@ -2443,7 +2414,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2451,6 +2422,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } @@ -2557,7 +2533,7 @@ static void rcu_sysidle_enter(int irq) /* Record start of fully idle period. */ j = jiffies; - ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); smp_mb__after_atomic(); @@ -2575,7 +2551,7 @@ static void rcu_sysidle_enter(int irq) */ void rcu_sysidle_force_exit(void) { - int oldstate = ACCESS_ONCE(full_sysidle_state); + int oldstate = READ_ONCE(full_sysidle_state); int newoldstate; /* @@ -2688,7 +2664,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, smp_mb(); /* Read counters before timestamps. */ /* Pick up timestamps. */ - j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + j = READ_ONCE(rdtp->dynticks_idle_jiffies); /* If this CPU entered idle more recently, update maxj timestamp. */ if (ULONG_CMP_LT(*maxj, j)) *maxj = j; @@ -2725,11 +2701,11 @@ static unsigned long rcu_sysidle_delay(void) static void rcu_sysidle(unsigned long j) { /* Check the current state. */ - switch (ACCESS_ONCE(full_sysidle_state)) { + switch (READ_ONCE(full_sysidle_state)) { case RCU_SYSIDLE_NOT: /* First time all are idle, so note a short idle period. */ - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); break; case RCU_SYSIDLE_SHORT: @@ -2767,7 +2743,7 @@ static void rcu_sysidle_cancel(void) { smp_mb(); if (full_sysidle_state > RCU_SYSIDLE_SHORT) - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; + WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); } /* @@ -2819,7 +2795,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) smp_mb(); /* grace period precedes setting inuse. */ rshp = container_of(rhp, struct rcu_sysidle_head, rh); - ACCESS_ONCE(rshp->inuse) = 0; + WRITE_ONCE(rshp->inuse, 0); } /* @@ -2830,7 +2806,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) bool rcu_sys_is_idle(void) { static struct rcu_sysidle_head rsh; - int rss = ACCESS_ONCE(full_sysidle_state); + int rss = READ_ONCE(full_sysidle_state); if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) return false; @@ -2858,7 +2834,7 @@ bool rcu_sys_is_idle(void) } rcu_sysidle_report(rcu_state_p, isidle, maxj, false); oldrss = rss; - rss = ACCESS_ONCE(full_sysidle_state); + rss = READ_ONCE(full_sysidle_state); } } @@ -2942,10 +2918,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress(rsp) || - ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) - return 1; + ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ))) + return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ - return 0; + return false; } /* @@ -2971,7 +2947,7 @@ static void rcu_bind_gp_kthread(void) static void rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); + WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } @@ -2979,6 +2955,6 @@ static void rcu_dynticks_task_enter(void) static void rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; + WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } diff --git a/kernel/kernel/rcu/tree_trace.c b/kernel/kernel/rcu/tree_trace.c index f92361efd..ef7093cc9 100644 --- a/kernel/kernel/rcu/tree_trace.c +++ b/kernel/kernel/rcu/tree_trace.c @@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v) static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d nbd: %lu\n", + seq_printf(m, "bcc: %d bseq: %lu\n", atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); + rsp->barrier_sequence); return 0; } @@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, + rdp->cpu_no_qs.b.norm, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), - rdp->qs_pending); + rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", - atomic_long_read(&rsp->expedited_start), - atomic_long_read(&rsp->expedited_done), - atomic_long_read(&rsp->expedited_wrap), - atomic_long_read(&rsp->expedited_tryfail), + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, + atomic_long_read(&rsp->expedited_workdone0), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_workdone3), atomic_long_read(&rsp->expedited_normal), - atomic_long_read(&rsp->expedited_stoppedcpus), - atomic_long_read(&rsp->expedited_done_tries), - atomic_long_read(&rsp->expedited_done_lost), - atomic_long_read(&rsp->expedited_done_exit)); + atomic_read(&rsp->expedited_need_qs), + rsp->expedited_sequence / 2); return 0; } @@ -271,13 +268,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", ulong2long(rsp->completed), ulong2long(gpnum), - rsp->fqs_state, + rsp->gp_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); + READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); @@ -323,8 +320,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp = &rsp->node[0]; raw_spin_lock_irqsave(&rnp->lock, flags); - completed = ACCESS_ONCE(rsp->completed); - gpnum = ACCESS_ONCE(rsp->gpnum); + completed = READ_ONCE(rsp->completed); + gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) gpage = 0; else @@ -364,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending); seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_qs_pending, + rdp->n_rp_core_needs_qs, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); diff --git a/kernel/kernel/rcu/update.c b/kernel/kernel/rcu/update.c index 1718c4fe9..9a3904603 100644 --- a/kernel/kernel/rcu/update.c +++ b/kernel/kernel/rcu/update.c @@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +/** + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * RCU-sched read-side critical section. In absence of + * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side + * critical section unless it can prove otherwise. Note that disabling + * of preemption (including disabling irqs) counts as an RCU-sched + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run in + * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. + */ +int rcu_read_lock_sched_held(void) +{ + int lockdep_opinion = 0; + + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + if (debug_locks) + lockdep_opinion = lock_is_held(&rcu_sched_lock_map); + return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); +} +EXPORT_SYMBOL(rcu_read_lock_sched_held); +#endif + #ifndef CONFIG_TINY_RCU static atomic_t rcu_expedited_nesting = @@ -150,14 +199,14 @@ void __rcu_read_unlock(void) barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) + if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; } #ifdef CONFIG_PROVE_LOCKING { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + int rrln = READ_ONCE(t->rcu_read_lock_nesting); WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); } @@ -271,20 +320,37 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } +EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void wait_rcu_gp(call_rcu_func_t crf) +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array) { - struct rcu_synchronize rcu; + int i; - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + /* Initialize and register callbacks for each flavor specified. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) { + might_sleep(); + continue; + } + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } + + /* Wait for all callbacks to be invoked. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) + continue; + wait_for_completion(&rs_array[i].completion); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } -EXPORT_SYMBOL_GPL(wait_rcu_gp); +EXPORT_SYMBOL_GPL(__wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) @@ -391,17 +457,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644); int rcu_jiffies_till_stall_check(void) { - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); /* * Limit check must be consistent with the Kconfig limits * for CONFIG_RCU_CPU_STALL_TIMEOUT. */ if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + WRITE_ONCE(rcu_cpu_stall_timeout, 3); till_stall_check = 3; } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + WRITE_ONCE(rcu_cpu_stall_timeout, 300); till_stall_check = 300; } return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; @@ -470,7 +536,7 @@ static void rcu_spawn_tasks_kthread(void); * Post an RCU-tasks callback. First call must be from process context * after the scheduler if fully operational. */ -void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; @@ -525,8 +591,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - rcu_lockdep_assert(!rcu_scheduler_active, - "synchronize_rcu_tasks called too soon"); + RCU_LOCKDEP_WARN(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ wait_rcu_gp(call_rcu_tasks); @@ -552,12 +618,12 @@ static void check_holdout_task(struct task_struct *t, { int cpu; - if (!ACCESS_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || - !ACCESS_ONCE(t->on_rq) || + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || (IS_ENABLED(CONFIG_NO_HZ_FULL) && !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - ACCESS_ONCE(t->rcu_tasks_holdout) = false; + WRITE_ONCE(t->rcu_tasks_holdout, false); list_del_init(&t->rcu_tasks_holdout_list); put_task_struct(t); return; @@ -641,11 +707,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) */ rcu_read_lock(); for_each_process_thread(g, t) { - if (t != current && ACCESS_ONCE(t->on_rq) && + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { get_task_struct(t); - t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); - ACCESS_ONCE(t->rcu_tasks_holdout) = true; + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); list_add(&t->rcu_tasks_holdout_list, &rcu_tasks_holdouts); } @@ -674,7 +740,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct task_struct *t1; schedule_timeout_interruptible(HZ); - rtst = ACCESS_ONCE(rcu_task_stall_timeout); + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); if (needreport) @@ -730,7 +796,7 @@ static void rcu_spawn_tasks_kthread(void) static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; - if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { + if (READ_ONCE(rcu_tasks_kthread_ptr)) { smp_mb(); /* Ensure caller sees full kthread. */ return; } @@ -742,7 +808,7 @@ static void rcu_spawn_tasks_kthread(void) t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); BUG_ON(IS_ERR(t)); smp_mb(); /* Ensure others see full kthread. */ - ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; + WRITE_ONCE(rcu_tasks_kthread_ptr, t); mutex_unlock(&rcu_tasks_kthread_mutex); } diff --git a/kernel/kernel/reboot.c b/kernel/kernel/reboot.c index d20c85d9f..bd30a973f 100644 --- a/kernel/kernel/reboot.c +++ b/kernel/kernel/reboot.c @@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, kernel_restart(buffer); break; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE case LINUX_REBOOT_CMD_KEXEC: ret = kernel_kexec(); break; diff --git a/kernel/kernel/relay.c b/kernel/kernel/relay.c index 509f68fb9..60684be39 100644 --- a/kernel/kernel/relay.c +++ b/kernel/kernel/relay.c @@ -81,10 +81,7 @@ static struct page **relay_alloc_page_array(unsigned int n_pages) */ static void relay_free_page_array(struct page **array) { - if (is_vmalloc_addr(array)) - vfree(array); - else - kfree(array); + kvfree(array); } /** diff --git a/kernel/kernel/resource.c b/kernel/kernel/resource.c index 90552aab5..249b1eb1e 100644 --- a/kernel/kernel/resource.c +++ b/kernel/kernel/resource.c @@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -/* - * Search for a resouce entry that fully contains the specified region. - * If found, return 1 if it is RAM, 0 if not. - * If not found, or region is not fully contained, return -1 +/** + * region_intersects() - determine intersection of region with known resources + * @start: region start address + * @size: size of region + * @name: name of resource (in iomem_resource) * - * Used by the ioremap functions to ensure the user is not remapping RAM and is - * a vast speed up over walking through the resource table page by page. + * Check if the specified region partially overlaps or fully eclipses a + * resource identified by @name. Return REGION_DISJOINT if the region + * does not overlap @name, return REGION_MIXED if the region overlaps + * @type and another resource, and return REGION_INTERSECTS if the + * region overlaps @type and no other defined resource. Note, that + * REGION_INTERSECTS is also returned in the case when the specified + * region overlaps RAM and undefined memory holes. + * + * region_intersect() is used by memory remapping functions to ensure + * the user is not remapping RAM and is a vast speed up over walking + * through the resource table page by page. */ -int region_is_ram(resource_size_t start, unsigned long size) +int region_intersects(resource_size_t start, size_t size, const char *name) { - struct resource *p; + unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; resource_size_t end = start + size - 1; - int flags = IORESOURCE_MEM | IORESOURCE_BUSY; - const char *name = "System RAM"; - int ret = -1; + int type = 0; int other = 0; + struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (end < p->start) - continue; - - if (p->start <= start && end <= p->end) { - /* resource fully contains region */ - if ((p->flags != flags) || strcmp(p->name, name)) - ret = 0; - else - ret = 1; - break; - } - if (p->end < start) - break; /* not found */ + bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + + if (start >= p->start && start <= p->end) + is_type ? type++ : other++; + if (end >= p->start && end <= p->end) + is_type ? type++ : other++; + if (p->start >= start && p->end <= end) + is_type ? type++ : other++; } read_unlock(&resource_lock); - return ret; + + if (other == 0) + return type ? REGION_INTERSECTS : REGION_DISJOINT; + + if (type) + return REGION_MIXED; + + return REGION_DISJOINT; } void __weak arch_remove_reservations(struct resource *avail) @@ -1072,9 +1083,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/kernel/sched/Makefile b/kernel/kernel/sched/Makefile index 3944d32a0..debedbee5 100644 --- a/kernel/kernel/sched/Makefile +++ b/kernel/kernel/sched/Makefile @@ -11,9 +11,9 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o wait-simple.o work-simple.o completion.o idle.o +obj-y += wait.o swait.o work-simple.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/kernel/sched/auto_group.c b/kernel/kernel/sched/auto_group.c index eae160dd6..750ed601d 100644 --- a/kernel/kernel/sched/auto_group.c +++ b/kernel/kernel/sched/auto_group.c @@ -1,5 +1,3 @@ -#ifdef CONFIG_SCHED_AUTOGROUP - #include "sched.h" #include <linux/proc_fs.h> @@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + if (!READ_ONCE(sysctl_sched_autogroup_enabled)) goto out; for_each_thread(p, t) @@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ - -#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/kernel/sched/auto_group.h b/kernel/kernel/sched/auto_group.h index 8bd047142..890c95f25 100644 --- a/kernel/kernel/sched/auto_group.h +++ b/kernel/kernel/sched/auto_group.h @@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; diff --git a/kernel/kernel/sched/clock.c b/kernel/kernel/sched/clock.c index c0a205101..caf4041f5 100644 --- a/kernel/kernel/sched/clock.c +++ b/kernel/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> diff --git a/kernel/kernel/sched/completion.c b/kernel/kernel/sched/completion.c index 45ebcffd9..b62cf6400 100644 --- a/kernel/kernel/sched/completion.c +++ b/kernel/kernel/sched/completion.c @@ -32,7 +32,7 @@ void complete(struct completion *x) raw_spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __swait_wake_locked(&x->wait, TASK_NORMAL, 1); + swake_up_locked(&x->wait); raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -52,7 +52,7 @@ void complete_all(struct completion *x) raw_spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __swait_wake_locked(&x->wait, TASK_NORMAL, 0); + swake_up_all_locked(&x->wait); raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -62,9 +62,9 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DEFINE_SWAITER(wait); + DECLARE_SWAITQUEUE(wait); - swait_prepare_locked(&x->wait, &wait); + __prepare_to_swait(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; @@ -75,7 +75,7 @@ do_wait_for_common(struct completion *x, timeout = action(timeout); raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - swait_finish_locked(&x->wait, &wait); + __finish_swait(&x->wait, &wait); if (!x->done) return timeout; } diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c index 799b75b27..94827a593 100644 --- a/kernel/kernel/sched/core.c +++ b/kernel/kernel/sched/core.c @@ -90,26 +90,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - unsigned long delta; - ktime_t soft, hard, now; - - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } -} - DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -184,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (static_key_enabled(&sched_feat_keys[i])) - static_key_slow_dec(&sched_feat_keys[i]); + static_key_disable(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!static_key_enabled(&sched_feat_keys[i])) - static_key_slow_inc(&sched_feat_keys[i]); + static_key_enable(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -359,12 +337,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) #ifdef CONFIG_SMP -static int __hrtick_restart(struct rq *rq) +static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = hrtimer_get_softexpires(timer); - return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* @@ -444,8 +421,8 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense. Rely on vruntime for fairness. */ delay = max_t(u64, delay, 10000LL); - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED); } static inline void init_hrtick(void) @@ -516,7 +493,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) @@ -570,7 +547,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } -void wake_up_q(struct wake_q_head *head) +void __wake_up_q(struct wake_q_head *head, bool sleeper) { struct wake_q_node *node = head->first; @@ -587,7 +564,10 @@ void wake_up_q(struct wake_q_head *head) * wake_up_process() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + if (sleeper) + wake_up_lock_sleeper(task); + else + wake_up_process(task); put_task_struct(task); } } @@ -676,26 +656,29 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu; - int i; + int i, cpu; struct sched_domain *sd; preempt_disable_rt(); cpu = smp_processor_id(); - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + + if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) goto preempt_en_rt; rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } } } + + if (!is_housekeeping_cpu(cpu)) + cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); preempt_en_rt: @@ -879,7 +862,7 @@ static void set_load_weight(struct task_struct *p) /* * SCHED_IDLE tasks get minimal weight: */ - if (p->policy == SCHED_IDLE) { + if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; @@ -889,17 +872,19 @@ static void set_load_weight(struct task_struct *p) load->inv_weight = prio_to_wmult[prio]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1063,7 +1048,11 @@ inline int task_curr(const struct task_struct *p) } /* - * Can drop rq->lock because from sched_class::switched_from() methods drop it. + * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, + * use the balance_callback list if you want balancing. + * + * this means any call to check_class_changed() must be followed by a call to + * balance_callback(). */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, @@ -1072,7 +1061,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); - /* Possble rq->lock 'hole'. */ + p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1104,6 +1093,305 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ + +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +{ + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + +struct migration_arg { + struct task_struct *task; + int dest_cpu; +}; + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +{ + if (unlikely(!cpu_active(dest_cpu))) + return rq; + + /* Affinity changed (again). */ + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return rq; + + rq = move_queued_task(rq, p, dest_cpu); + + return rq; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + struct task_struct *p = arg->task; + struct rq *rq = this_rq(); + + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); + + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); + + local_irq_enable(); + return 0; +} + +/* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +{ + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + if (__migrate_disabled(p)) { + cpumask_copy(&p->cpus_allowed, new_mask); + return; + } + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, DEQUEUE_SAVE); + } + if (running) + put_prev_task(rq, p); + + p->sched_class->set_cpus_allowed(p, new_mask); + + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE); +} + +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; + +void tell_sched_cpu_down_begin(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +/** + * migrate_me - try to move the current task off this cpu + * + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. + * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. + */ +int migrate_me(void) +{ + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; + unsigned long flags; + unsigned int dest_cpu; + struct rq *rq; + + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_NO_SETAFFINITY || p->state) + return 0; + + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); + + cpumask = this_cpu_ptr(&sched_cpumasks); + mask = &p->cpus_allowed; + + cpumask_andnot(cpumask, mask, &sched_down_cpumask); + + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } + + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); + + arg.task = p; + arg.dest_cpu = dest_cpu; + + task_rq_unlock(rq, p, &flags); + + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); + + return 1; +} + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; + } + + do_set_cpus_allowed(p, new_mask); + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) + goto out; + + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (task_running(rq, p) || p->state == TASK_WAKING) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } else if (task_on_rq_queued(p)) { + /* + * OK, since we're going to drop the lock immediately + * afterwards anyway. + */ + lockdep_unpin_lock(&rq->lock); + rq = move_queued_task(rq, p, dest_cpu); + lockdep_pin_lock(&rq->lock); + } +out: + task_rq_unlock(rq, p, &flags); + + return ret; +} + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -1134,9 +1422,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p, new_cpu); + p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; - perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + perf_event_task_migrate(p); } __set_task_cpu(p, new_cpu); @@ -1175,12 +1463,16 @@ static int migrate_swap_stop(void *data) struct rq *src_rq, *dst_rq; int ret = -EAGAIN; + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) + return -EAGAIN; + src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) goto unlock; @@ -1244,13 +1536,6 @@ out: return ret; } -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); - static bool check_task_state(struct task_struct *p, long match_state) { bool match = false; @@ -1396,9 +1681,7 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); -#endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ @@ -1438,13 +1721,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) goto out; } + /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; - + if (IS_ENABLED(CONFIG_CPUSETS)) { + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + } + /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -1478,7 +1763,9 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - if (p->nr_cpus_allowed > 1) + lockdep_assert_held(&p->pi_lock); + + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* @@ -1503,7 +1790,16 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } -#endif + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + +#endif /* CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -1545,7 +1841,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1558,12 +1854,19 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); - p->state = TASK_RUNNING; + trace_sched_wakeup(p); + #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Our task @p is fully woken up and running; so its safe to + * drop the rq->lock, hereafter rq is only used for statistics. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; @@ -1582,6 +1885,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) { + lockdep_assert_held(&rq->lock); + #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; @@ -1626,6 +1931,7 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); + lockdep_pin_lock(&rq->lock); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1633,6 +1939,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } + lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1729,7 +2036,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) #endif raw_spin_lock(&rq->lock); + lockdep_pin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); } @@ -1784,6 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(wake_flags & WF_LOCK_SLEEPER)) p->saved_state = TASK_RUNNING; + trace_sched_waking(p); + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1792,13 +2103,38 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #ifdef CONFIG_SMP /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ while (p->on_cpu) cpu_relax(); /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * Combined with the control dependency above, we have an effective + * smp_load_acquire() without the need for full barriers. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. */ smp_rmb(); @@ -1838,7 +2174,6 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(__task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1894,9 +2229,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -1936,23 +2268,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG + void set_numabalancing_state(bool enabled) { if (enabled) - sched_feat_set("NUMA"); + static_branch_enable(&sched_numa_balancing); else - sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool numabalancing_enabled; - -void set_numabalancing_state(bool enabled) -{ - numabalancing_enabled = enabled; + static_branch_disable(&sched_numa_balancing); } -#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, @@ -1960,7 +2286,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = numabalancing_enabled; + int state = static_branch_likely(&sched_numa_balancing); if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2042,7 +2368,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) set_task_cpu(p, cpu); raw_spin_unlock_irqrestore(&p->pi_lock, flags); -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif @@ -2081,8 +2407,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2091,8 +2417,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -2174,6 +2500,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2183,28 +2511,50 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Initialize new task's runnable average */ - init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); + trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Nothing relies on rq->lock after this, so its fine to + * drop it. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } #endif task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; + +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); + +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_dec); + /** * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) { + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); } EXPORT_SYMBOL_GPL(preempt_notifier_register); @@ -2213,7 +2563,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register); * preempt_notifier_unregister - no longer interested in preemption notifications * @notifier: notifier struct to unregister * - * This is safe to call from within a preemption notifier. + * This is *not* safe to call from within a preemption notifier. */ void preempt_notifier_unregister(struct preempt_notifier *notifier) { @@ -2221,7 +2571,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier) } EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) { struct preempt_notifier *notifier; @@ -2229,9 +2579,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) notifier->ops->sched_in(notifier, raw_smp_processor_id()); } +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_in_preempt_notifiers(curr); +} + static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) +__fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) { struct preempt_notifier *notifier; @@ -2239,13 +2595,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, notifier->ops->sched_out(notifier, next); } +static __always_inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + if (static_key_false(&preempt_notifier_key)) + __fire_sched_out_preempt_notifiers(curr, next); +} + #else /* !CONFIG_PREEMPT_NOTIFIERS */ -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { } -static void +static inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { @@ -2270,7 +2634,6 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - trace_sched_switch(prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); @@ -2304,6 +2667,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); + rq->prev_mm = NULL; /* @@ -2311,15 +2690,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul <manfred@colorfullife.com> + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); - finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2343,30 +2721,42 @@ static struct rq *finish_task_switch(struct task_struct *prev) put_task_struct(prev); } - tick_nohz_task_switch(current); + tick_nohz_task_switch(); return rq; } #ifdef CONFIG_SMP /* rq->lock is NOT held, but preemption is disabled */ -static inline void post_schedule(struct rq *rq) +static void __balance_callback(struct rq *rq) { - if (rq->post_schedule) { - unsigned long flags; + struct callback_head *head, *next; + void (*func)(struct rq *rq); + unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->curr->sched_class->post_schedule) - rq->curr->sched_class->post_schedule(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); + head = rq->balance_callback; + rq->balance_callback = NULL; + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; - rq->post_schedule = 0; + func(rq); } + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static inline void balance_callback(struct rq *rq) +{ + if (unlikely(rq->balance_callback)) + __balance_callback(rq); } #else -static inline void post_schedule(struct rq *rq) +static inline void balance_callback(struct rq *rq) { } @@ -2381,10 +2771,17 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + rq = finish_task_switch(prev); - post_schedule(rq); + balance_callback(rq); preempt_enable(); if (current->set_child_tid) @@ -2428,9 +2825,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ + lockdep_unpin_lock(&rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); barrier(); @@ -2456,13 +2853,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); @@ -2495,9 +2899,9 @@ unsigned long nr_iowait_cpu(int cpu) void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -2595,6 +2999,7 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2623,7 +3028,7 @@ void scheduler_tick(void) u64 scheduler_tick_max_deferment(void) { struct rq *rq = this_rq(); - unsigned long next, now = ACCESS_ONCE(jiffies); + unsigned long next, now = READ_ONCE(jiffies); next = rq->last_sched_tick + HZ; @@ -2634,16 +3039,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -2665,7 +3060,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -2692,7 +3087,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -2732,15 +3127,13 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); + BUG_ON(task_stack_end_corrupted(prev)); #endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) + + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -2749,47 +3142,12 @@ static inline void schedule_debug(struct task_struct *prev) } #if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) -#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ -#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) -#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) - -static inline void update_migrate_disable(struct task_struct *p) -{ - const struct cpumask *mask; - - if (likely(!p->migrate_disable)) - return; - - /* Did we already update affinity? */ - if (unlikely(migrate_disabled_updated(p))) - return; - - /* - * Since this is always current we can get away with only locking - * rq->lock, the ->cpus_allowed value can normally only be changed - * while holding both p->pi_lock and rq->lock, but seeing that this - * is current, we cannot actually be waking up, so all code that - * relies on serialization against p->pi_lock is out of scope. - * - * Having rq->lock serializes us against things like - * set_cpus_allowed_ptr() that can still happen concurrently. - */ - mask = tsk_cpus_allowed(p); - - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */ - p->nr_cpus_allowed = 1; - - /* Let migrate_enable know to fix things back up */ - p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; -} void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic()) { + if (in_atomic() || irqs_disabled()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2819,11 +3177,8 @@ EXPORT_SYMBOL(migrate_disable); void migrate_enable(void) { struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - if (in_atomic()) { + if (in_atomic() || irqs_disabled()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -2838,41 +3193,23 @@ void migrate_enable(void) #endif WARN_ON_ONCE(p->migrate_disable <= 0); - if (migrate_disable_count(p) > 1) { + if (p->migrate_disable > 1) { p->migrate_disable--; return; } preempt_disable(); - if (unlikely(migrate_disabled_updated(p))) { - /* - * Undo whatever update_migrate_disable() did, also see there - * about locking. - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - - /* - * Clearing migrate_disable causes tsk_cpus_allowed to - * show the tasks original cpu affinity. - */ - p->migrate_disable = 0; - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } else - p->migrate_disable = 0; + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. + */ + p->migrate_disable = 0; unpin_current_cpu(); preempt_enable(); preempt_lazy_enable(); } EXPORT_SYMBOL(migrate_enable); -#else -static inline void update_migrate_disable(struct task_struct *p) { } -#define migrate_disabled_updated(p) 0 #endif /* @@ -2951,23 +3288,31 @@ again: * - return from syscall or exception to user-space * - return from interrupt-handler to user-space * - * WARNING: all callers must re-check need_resched() afterward and reschedule - * accordingly in case an event triggered the need for rescheduling (such as - * an interrupt waking up a task) while preemption was disabled in __schedule(). + * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(void) +static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_note_context_switch(); prev = rq->curr; + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. + */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + schedule_debug(prev); if (sched_feat(HRTICK)) @@ -2980,13 +3325,12 @@ static void __sched __schedule(void) */ smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); - - update_migrate_disable(prev); + lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { @@ -3010,14 +3354,15 @@ static void __sched __schedule(void) rq->curr = next; ++*switch_count; + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); - } else + } else { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); + } - post_schedule(rq); - - sched_preempt_enable_no_resched(); + balance_callback(rq); } static inline void sched_submit_work(struct task_struct *tsk) @@ -3055,7 +3400,9 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { - __schedule(); + preempt_disable(); + __schedule(false); + sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); } @@ -3095,18 +3442,41 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + preempt_disable_notrace(); + __schedule(true); + preempt_enable_no_resched_notrace(); /* * Check again in case we missed a preemption opportunity * between schedule and now. */ - barrier(); } while (need_resched()); } +#ifdef CONFIG_PREEMPT_LAZY +/* + * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is + * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as + * preempt_lazy_count counter >0. + */ +static __always_inline int preemptible_lazy(void) +{ + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; +} + +#else + +static int preemptible_lazy(void) +{ + return 1; +} + +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -3121,15 +3491,16 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); -#ifdef CONFIG_CONTEXT_TRACKING /** - * preempt_schedule_context - preempt_schedule called by tracing + * preempt_schedule_notrace - preempt_schedule called by tracing * * The tracing infrastructure uses preempt_enable_notrace to prevent * recursion and tracing preempt enabling caused by the tracing @@ -3142,23 +3513,17 @@ EXPORT_SYMBOL(preempt_schedule); * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) { enum ctx_state prev_ctx; if (likely(!preemptible())) return; - -#ifdef CONFIG_PREEMPT_LAZY - /* - * Check for lazy preemption - */ - if (current_thread_info()->preempt_lazy_count && - !test_thread_flag(TIF_NEED_RESCHED)) + if (!preemptible_lazy()) return; -#endif + do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_disable_notrace(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3173,16 +3538,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) * we must force it manually. */ start_critical_timings(); - __schedule(); + __schedule(true); stop_critical_timings(); exception_exit(prev_ctx); - __preempt_count_sub(PREEMPT_ACTIVE); - barrier(); + preempt_enable_no_resched_notrace(); } while (need_resched()); } -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_CONTEXT_TRACKING */ +EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPT */ @@ -3202,17 +3565,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_disable(); local_irq_enable(); - __schedule(); + __schedule(true); local_irq_disable(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); + sched_preempt_enable_no_resched(); } while (need_resched()); exception_exit(prev_state); @@ -3240,7 +3597,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; struct rq *rq; const struct sched_class *prev_class; @@ -3272,7 +3629,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3290,8 +3647,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - p->dl.dl_throttled = 0; - enqueue_flag = ENQUEUE_REPLENISH; + enqueue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3299,7 +3655,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag = ENQUEUE_HEAD; + enqueue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3318,7 +3674,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: + preempt_disable(); /* avoid rq from going away on us */ __task_rq_unlock(rq); + + balance_callback(rq); + preempt_enable(); } #endif @@ -3347,7 +3707,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3356,7 +3716,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3656,7 +4016,7 @@ static bool dl_param_changed(struct task_struct *p, static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, - bool user) + bool user, bool pi) { int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; @@ -3677,10 +4037,7 @@ recheck: } else { reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_DEADLINE && - policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + if (!valid_policy(policy)) return -EINVAL; } @@ -3736,7 +4093,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (idle_policy(p->policy) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3842,44 +4199,58 @@ change: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + if (new_effective_prio == oldprio) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); prev_class = p->sched_class; - __setscheduler(rq, p, attr, true); + __setscheduler(rq, p, attr, pi); if (running) p->sched_class->set_curr_task(rq); if (queued) { + int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + if (oldprio <= p->prio) + enqueue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, enqueue_flags); } check_class_changed(rq, p, prev_class, oldprio); + preempt_disable(); /* avoid rq from going away on us */ task_rq_unlock(rq, p, &flags); - rt_mutex_adjust_pi(p); + if (pi) + rt_mutex_adjust_pi(p); + + /* + * Run balance callbacks after we've adjusted the PI chain. + */ + balance_callback(rq); + preempt_enable(); return 0; } @@ -3900,7 +4271,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy, attr.sched_policy = policy; } - return __sched_setscheduler(p, &attr, check); + return __sched_setscheduler(p, &attr, check, true); } /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. @@ -3921,7 +4292,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { - return __sched_setscheduler(p, attr, true); + return __sched_setscheduler(p, attr, true, true); } EXPORT_SYMBOL_GPL(sched_setattr); @@ -3943,6 +4314,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -4323,7 +4695,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4475,7 +4847,7 @@ SYSCALL_DEFINE0(sched_yield) int __sched _cond_resched(void) { - if (should_resched()) { + if (should_resched(0)) { preempt_schedule_common(); return 1; } @@ -4493,7 +4865,7 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - int resched = should_resched(); + int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; lockdep_assert_held(lock); @@ -4516,7 +4888,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (should_resched()) { + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { local_bh_enable(); preempt_schedule_common(); local_bh_disable(); @@ -4850,13 +5222,22 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4873,10 +5254,11 @@ void init_idle(struct task_struct *idle, int cpu) rq->curr = rq->idle = idle; idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -4889,7 +5271,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif } @@ -4973,229 +5355,6 @@ out: } #ifdef CONFIG_SMP -/* - * move_queued_task - move a queued task to new rq. - * - * Returns (locked) new rq. Old rq's lock is released. - */ -static struct rq *move_queued_task(struct task_struct *p, int new_cpu) -{ - struct rq *rq = task_rq(p); - - lockdep_assert_held(&rq->lock); - - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(new_cpu); - - raw_spin_lock(&rq->lock); - BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - - return rq; -} - -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - if (!migrate_disabled_updated(p)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); - } - - cpumask_copy(&p->cpus_allowed, new_mask); -} - -static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); -static DEFINE_MUTEX(sched_down_mutex); -static cpumask_t sched_down_cpumask; - -void tell_sched_cpu_down_begin(int cpu) -{ - mutex_lock(&sched_down_mutex); - cpumask_set_cpu(cpu, &sched_down_cpumask); - mutex_unlock(&sched_down_mutex); -} - -void tell_sched_cpu_down_done(int cpu) -{ - mutex_lock(&sched_down_mutex); - cpumask_clear_cpu(cpu, &sched_down_cpumask); - mutex_unlock(&sched_down_mutex); -} - -/** - * migrate_me - try to move the current task off this cpu - * - * Used by the pin_current_cpu() code to try to get tasks - * to move off the current CPU as it is going down. - * It will only move the task if the task isn't pinned to - * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) - * and the task has to be in a RUNNING state. Otherwise the - * movement of the task will wake it up (change its state - * to running) when the task did not expect it. - * - * Returns 1 if it succeeded in moving the current task - * 0 otherwise. - */ -int migrate_me(void) -{ - struct task_struct *p = current; - struct migration_arg arg; - struct cpumask *cpumask; - struct cpumask *mask; - unsigned long flags; - unsigned int dest_cpu; - struct rq *rq; - - /* - * We can not migrate tasks bounded to a CPU or tasks not - * running. The movement of the task will wake it up. - */ - if (p->flags & PF_NO_SETAFFINITY || p->state) - return 0; - - mutex_lock(&sched_down_mutex); - rq = task_rq_lock(p, &flags); - - cpumask = this_cpu_ptr(&sched_cpumasks); - mask = &p->cpus_allowed; - - cpumask_andnot(cpumask, mask, &sched_down_cpumask); - - if (!cpumask_weight(cpumask)) { - /* It's only on this CPU? */ - task_rq_unlock(rq, p, &flags); - mutex_unlock(&sched_down_mutex); - return 0; - } - - dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); - - arg.task = p; - arg.dest_cpu = dest_cpu; - - task_rq_unlock(rq, p, &flags); - - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - mutex_unlock(&sched_down_mutex); - - return 1; -} - -/* - * This is how migration works: - * - * 1) we invoke migration_cpu_stop() on the target CPU using - * stop_one_cpu(). - * 2) stopper starts to run (implicitly forcing the migrated thread - * off the CPU) - * 3) it checks whether the migrated task is still in the wrong runqueue. - * 4) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 5) stopper completes and stop_one_cpu() returns and the migration - * is done. - */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - unsigned long flags; - struct rq *rq; - unsigned int dest_cpu; - int ret = 0; - - rq = task_rq_lock(p, &flags); - - if (cpumask_equal(&p->cpus_allowed, new_mask)) - goto out; - - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } - - do_set_cpus_allowed(p, new_mask); - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) - goto out; - - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - return 0; - } else if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -out: - task_rq_unlock(rq, p, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq; - int ret = 0; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq = cpu_rq(src_cpu); - - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - goto fail; - - /* - * If we're not on a rq, the next wake-up will ensure we're - * placed properly. - */ - if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -done: - ret = 1; -fail: - raw_spin_unlock(&rq->lock); - raw_spin_unlock(&p->pi_lock); - return ret; -} #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -5231,7 +5390,7 @@ void sched_setnuma(struct task_struct *p, int nid) running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -5240,38 +5399,12 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); task_rq_unlock(rq, p, &flags); } -#endif - -/* - * migration_cpu_stop - this will be executed by a highprio stopper thread - * and performs thread migration by bumping thread off CPU then - * 'pushing' onto another runqueue. - */ -static int migration_cpu_stop(void *data) -{ - struct migration_arg *arg = data; - - /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. - */ - local_irq_disable(); - /* - * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed - * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. - */ - sched_ttwu_pending(); - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); - local_irq_enable(); - return 0; -} +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU - static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); /* @@ -5333,9 +5466,9 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(unsigned int dead_cpu) +static void migrate_tasks(struct rq *dead_rq) { - struct rq *rq = cpu_rq(dead_cpu); + struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; @@ -5357,7 +5490,7 @@ static void migrate_tasks(unsigned int dead_cpu) */ update_rq_clock(rq); - for ( ; ; ) { + for (;;) { /* * There's this thread running, bail when that's the only * remaining thread. @@ -5365,22 +5498,52 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; + /* + * pick_next_task assumes pinned rq->lock. + */ + lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_cpu, next); + /* + * Rules for changing task_struct::cpus_allowed are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * + * Drop rq->lock is not quite as disastrous as it usually is + * because !cpu_active at this point, which means load-balance + * will not interfere. Also, stop-machine. + */ + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); + raw_spin_lock(&next->pi_lock); + raw_spin_lock(&rq->lock); - __migrate_task(next, dead_cpu, dest_cpu); + /* + * Since we're inside stop-machine, _nothing_ should have + * changed the task, WARN if weird stuff happened, because in + * that case the above rq->lock drop is a fail too. + */ + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + raw_spin_unlock(&next->pi_lock); + continue; + } - raw_spin_lock(&rq->lock); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_rq->cpu, next); + + rq = __migrate_task(rq, next, dest_cpu); + if (rq != dead_rq) { + raw_spin_unlock(&rq->lock); + rq = dead_rq; + raw_spin_lock(&rq->lock); + } + raw_spin_unlock(&next->pi_lock); } rq->stop = stop; } - #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5546,8 +5709,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); @@ -5559,7 +5721,7 @@ static void register_sched_domain_sysctl(void) static void unregister_sched_domain_sysctl(void) { } -#endif +#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ static void set_rq_online(struct rq *rq) { @@ -5628,7 +5790,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(cpu); + migrate_tasks(rq); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5658,7 +5820,7 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; -static void __cpuinit set_cpu_rq_start_time(void) +static void set_cpu_rq_start_time(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -5668,21 +5830,27 @@ static void __cpuinit set_cpu_rq_start_time(void) static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: /* * At this point a starting CPU has marked itself as online via * set_cpu_online(). But it might not yet have marked itself * as active, which is essential from here on. - * - * Thus, fall-through and help the starting CPU along. */ + set_cpu_active(cpu, true); + stop_machine_unpark(cpu); + return NOTIFY_OK; + case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); + set_cpu_active(cpu, true); return NOTIFY_OK; + default: return NOTIFY_DONE; } @@ -5718,9 +5886,6 @@ static int __init migration_init(void) return 0; } early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ @@ -5953,13 +6118,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); @@ -6617,7 +6782,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -6695,8 +6861,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { @@ -6817,7 +6985,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -6946,7 +7114,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_group *sg; struct sched_group_capacity *sgc; - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sd) return -ENOMEM; @@ -7420,8 +7588,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -7551,7 +7717,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->post_schedule = 0; + rq->balance_callback = NULL; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -7618,8 +7784,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + - sched_rcu_preempt_depth(); + int nested = preempt_count() + sched_rcu_preempt_depth(); return (nested == preempt_offset); } @@ -7682,32 +7847,12 @@ EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) +void normalize_rt_tasks(void) { - const struct sched_class *prev_class = p->sched_class; + struct task_struct *g, *p; struct sched_attr attr = { .sched_policy = SCHED_NORMAL, }; - int old_prio = p->prio; - int queued; - - queued = task_on_rq_queued(p); - if (queued) - dequeue_task(rq, p, 0); - __setscheduler(rq, p, &attr, false); - if (queued) { - enqueue_task(rq, p, 0); - resched_curr(rq); - } - - check_class_changed(rq, p, prev_class, old_prio); -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7734,9 +7879,7 @@ void normalize_rt_tasks(void) continue; } - rq = task_rq_lock(p, &flags); - normalize_task(rq, p); - task_rq_unlock(rq, p, &flags); + __sched_setscheduler(p, &attr, false, false); } read_unlock(&tasklist_lock); } @@ -7888,7 +8031,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, 0); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7904,7 +8047,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -7912,7 +8055,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, ENQUEUE_RESTORE); task_rq_unlock(rq, tsk, &flags); } @@ -8087,11 +8230,11 @@ static long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; - rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); @@ -8340,17 +8483,17 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task) +static void cpu_cgroup_fork(struct task_struct *task, void *private) { sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8363,30 +8506,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } -static void cpu_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - sched_move_task(task); -} - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -8458,10 +8586,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ - if (runtime_enabled && cfs_b->timer_active) { - /* force a reprogram */ - __start_cfs_bandwidth(cfs_b, true); - } + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { @@ -8720,7 +8846,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .exit = cpu_cgroup_exit, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/kernel/sched/cpudeadline.c b/kernel/kernel/sched/cpudeadline.c index c6acb0746..5be588204 100644 --- a/kernel/kernel/sched/cpudeadline.c +++ b/kernel/kernel/sched/cpudeadline.c @@ -31,11 +31,6 @@ static inline int right_child(int i) return (i << 1) + 2; } -static inline int dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; @@ -108,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/kernel/sched/cpudeadline.h b/kernel/kernel/sched/cpudeadline.h index 1a0a6ef2f..fcbdf83fe 100644 --- a/kernel/kernel/sched/cpudeadline.h +++ b/kernel/kernel/sched/cpudeadline.h @@ -2,6 +2,7 @@ #define _LINUX_CPUDL_H #include <linux/sched.h> +#include <linux/sched/deadline.h> #define IDX_INVALID -1 diff --git a/kernel/kernel/sched/cpupri.c b/kernel/kernel/sched/cpupri.c index 981fcd7dc..11e9705bf 100644 --- a/kernel/kernel/sched/cpupri.c +++ b/kernel/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/kernel/sched/cputime.c b/kernel/kernel/sched/cputime.c index 2da134c9b..0f75a38cf 100644 --- a/kernel/kernel/sched/cputime.c +++ b/kernel/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -555,48 +556,43 @@ drop_precision: } /* - * Atomically advance counter to the new value. Interrupts, vcpu - * scheduling, and scaling inaccuracies can cause cputime_advance - * to be occasionally called with a new value smaller than counter. - * Let's enforce atomicity. + * Adjust tick based cputime random precision against scheduler runtime + * accounting. * - * Normally a caller will only go through this loop once, or not - * at all in case a previous caller updated counter the same jiffy. - */ -static void cputime_advance(cputime_t *counter, cputime_t new) -{ - cputime_t old; - - while (new > (old = ACCESS_ONCE(*counter))) - cmpxchg_cputime(counter, old, new); -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. + * Tick based cputime accounting depend on random scheduling timeslices of a + * task to be interrupted or not by the timer. Depending on these + * circumstances, the number of these interrupts may be over or + * under-optimistic, matching the real user and system cputime with a variable + * precision. + * + * Fix this by scaling these tick based values against the total runtime + * accounted by the CFS scheduler. + * + * This code provides the following guarantees: + * + * stime + utime == rtime + * stime_i+1 >= stime_i, utime_i+1 >= utime_i + * + * Assuming that rtime_i+1 >= rtime_i. */ static void cputime_adjust(struct task_cputime *curr, - struct cputime *prev, + struct prev_cputime *prev, cputime_t *ut, cputime_t *st) { cputime_t rtime, stime, utime; + unsigned long flags; - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ + /* Serialize concurrent callers such that we can honour our guarantees */ + raw_spin_lock_irqsave(&prev->lock, flags); rtime = nsecs_to_cputime(curr->sum_exec_runtime); /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. + * This is possible under two circumstances: + * - rtime isn't monotonic after all (a bug); + * - we got reordered by the lock. + * + * In both cases this acts as a filter such that the rest of the code + * can assume it is monotonic regardless of anything else. */ if (prev->stime + prev->utime >= rtime) goto out; @@ -606,22 +602,46 @@ static void cputime_adjust(struct task_cputime *curr, if (utime == 0) { stime = rtime; - } else if (stime == 0) { - utime = rtime; - } else { - cputime_t total = stime + utime; + goto update; + } - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; + if (stime == 0) { + utime = rtime; + goto update; } - cputime_advance(&prev->stime, stime); - cputime_advance(&prev->utime, utime); + stime = scale_stime((__force u64)stime, (__force u64)rtime, + (__force u64)(stime + utime)); + + /* + * Make sure stime doesn't go backwards; this preserves monotonicity + * for utime because rtime is monotonic. + * + * utime_i+1 = rtime_i+1 - stime_i + * = rtime_i+1 - (rtime_i - utime_i) + * = (rtime_i+1 - rtime_i) + utime_i + * >= utime_i + */ + if (stime < prev->stime) + stime = prev->stime; + utime = rtime - stime; + + /* + * Make sure utime doesn't go backwards; this still preserves + * monotonicity for stime, analogous argument to above. + */ + if (utime < prev->utime) { + utime = prev->utime; + stime = rtime - utime; + } +update: + prev->stime = stime; + prev->utime = utime; out: *ut = prev->utime; *st = prev->stime; + raw_spin_unlock_irqrestore(&prev->lock, flags); } void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) @@ -633,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -785,6 +806,9 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; + if (!context_tracking_is_enabled()) + return t->gtime; + do { seq = read_seqcount_begin(&t->vtime_seq); diff --git a/kernel/kernel/sched/deadline.c b/kernel/kernel/sched/deadline.c index 0c261c511..7a72e69fc 100644 --- a/kernel/kernel/sched/deadline.c +++ b/kernel/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -213,14 +213,28 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return dl_task(prev); } -static inline void set_post_schedule(struct rq *rq) +static DEFINE_PER_CPU(struct callback_head, dl_push_head); +static DEFINE_PER_CPU(struct callback_head, dl_pull_head); + +static void push_dl_tasks(struct rq *); +static void pull_dl_task(struct rq *); + +static inline void queue_push_tasks(struct rq *rq) { - rq->post_schedule = has_pushable_dl_tasks(rq); + if (!has_pushable_dl_tasks(rq)) + return; + + queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); +} + +static inline void queue_pull_task(struct rq *rq) +{ + queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); } static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); -static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; bool fallback = false; @@ -254,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) double_lock_balance(rq, later_rq); } + /* + * By now the task is replenished and enqueued; migrate it. + */ deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, ENQUEUE_REPLENISH); + activate_task(later_rq, p, 0); if (!fallback) resched_curr(later_rq); - double_unlock_balance(rq, later_rq); + double_unlock_balance(later_rq, rq); + + return later_rq; } #else @@ -291,12 +310,15 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return false; } -static inline int pull_dl_task(struct rq *rq) +static inline void pull_dl_task(struct rq *rq) +{ +} + +static inline void queue_push_tasks(struct rq *rq) { - return 0; } -static inline void set_post_schedule(struct rq *rq) +static inline void queue_pull_task(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -498,24 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +static int start_dl_timer(struct task_struct *p) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->dl_timer; + struct rq *rq = task_rq(p); ktime_t now, act; - ktime_t soft, hard; - unsigned long range; s64 delta; - if (boosted) - return 0; + lockdep_assert_held(&rq->lock); + /* * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. */ act = ns_to_ktime(dl_se->deadline); - now = hrtimer_cb_get_time(&dl_se->dl_timer); + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -527,15 +548,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) if (ktime_us_delta(act, now) < 0) return 0; - hrtimer_set_expires(&dl_se->dl_timer, act); - - soft = hrtimer_get_softexpires(&dl_se->dl_timer); - hard = hrtimer_get_expires(&dl_se->dl_timer); - range = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&dl_se->dl_timer, soft, - range, HRTIMER_MODE_ABS, 0); + /* + * !enqueued will guarantee another callback; even if one is already in + * progress. This ensures a balanced {get,put}_task_struct(). + * + * The race against __run_timer() clearing the enqueued state is + * harmless because we're holding task_rq()->lock, therefore the timer + * expiring after we've done the check will wait on its task_rq_lock() + * and observe our state. + */ + if (!hrtimer_is_queued(timer)) { + get_task_struct(p); + hrtimer_start(timer, act, HRTIMER_MODE_ABS); + } - return hrtimer_active(&dl_se->dl_timer); + return 1; } /* @@ -563,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &flags); /* - * We need to take care of several possible races here: - * - * - the task might have changed its scheduling policy - * to something different than SCHED_DEADLINE - * - the task might have changed its reservation parameters - * (through sched_setattr()) - * - the task might have been boosted by someone else and - * might be in the boosting/deboosting path + * The task might have changed its scheduling policy to something + * different than SCHED_DEADLINE (through switched_fromd_dl()). + */ + if (!dl_task(p)) { + __dl_clear_params(p); + goto unlock; + } + + /* + * This is possible if switched_from_dl() raced against a running + * callback that took the above !dl_task() path and we've since then + * switched back into SCHED_DEADLINE. * - * In all this cases we bail out, as the task is already - * in the runqueue or is going to be enqueued back anyway. + * There's nothing to do except drop our task reference. */ - if (!dl_task(p) || dl_se->dl_new || - dl_se->dl_boosted || !dl_se->dl_throttled) + if (dl_se->dl_new) goto unlock; - sched_clock_tick(); - update_rq_clock(rq); + /* + * The task might have been boosted by someone else and might be in the + * boosting/deboosting path, its not throttled. + */ + if (dl_se->dl_boosted) + goto unlock; -#ifdef CONFIG_SMP /* - * If we find that the rq the task was on is no longer - * available, we need to select a new rq. + * Spurious timer due to start_dl_timer() race; or we already received + * a replenishment from rt_mutex_setprio(). */ - if (unlikely(!rq->online)) { - dl_task_offline_migration(rq, p); + if (!dl_se->dl_throttled) goto unlock; - } -#endif + + sched_clock_tick(); + update_rq_clock(rq); /* * If the throttle happened during sched-out; like: @@ -617,17 +649,45 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + #ifdef CONFIG_SMP /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. + * Perform balancing operations here; after the replenishments. We + * cannot drop rq->lock before this, otherwise the assertion in + * start_dl_timer() about not missing updates is not true. + * + * If we find that the rq the task was on is no longer available, we + * need to select a new rq. + * + * XXX figure out if select_task_rq_dl() deals with offline cpus. */ - if (has_pushable_dl_tasks(rq)) + if (unlikely(!rq->online)) + rq = dl_task_offline_migration(rq, p); + + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + lockdep_unpin_lock(&rq->lock); push_dl_task(rq); + lockdep_pin_lock(&rq->lock); + } #endif + unlock: task_rq_unlock(rq, p, &flags); + /* + * This can free the task_struct, including this hrtimer, do not touch + * anything related to that after this. + */ + put_task_struct(p); + return HRTIMER_NORESTART; } @@ -641,7 +701,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) } static -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +int dl_runtime_exceeded(struct sched_dl_entity *dl_se) { return (dl_se->runtime <= 0); } @@ -685,10 +745,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(rq, dl_se)) { + if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) + if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -901,7 +961,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) /* * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (relative) deadline is + * task if we have one and its (absolute) deadline is * smaller than our one... OTW we keep our runtime and * deadline. */ @@ -930,7 +990,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -996,7 +1056,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If we are dealing with a -deadline task, we must @@ -1008,12 +1068,15 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. */ if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + (tsk_nr_cpus_allowed(p) > 1)) { int target = find_later_rq(p); - if (target != -1) + if (target != -1 && + (dl_time_before(p->dl.deadline, + cpu_rq(target)->dl.earliest_dl.curr) || + (cpu_rq(target)->dl.dl_nr_running == 0))) cpu = target; } rcu_read_unlock(); @@ -1028,7 +1091,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1036,15 +1099,13 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 && + if (tsk_nr_cpus_allowed(p) != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; resched_curr(rq); } -static int pull_dl_task(struct rq *this_rq); - #endif /* CONFIG_SMP */ /* @@ -1101,7 +1162,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; if (need_pull_dl_task(rq, prev)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we're + * being very careful to re-start the picking loop. + */ + lockdep_unpin_lock(&rq->lock); pull_dl_task(rq); + lockdep_pin_lock(&rq->lock); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1135,7 +1204,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); - set_post_schedule(rq); + queue_push_tasks(rq); return p; } @@ -1144,7 +1213,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1172,7 +1241,6 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { - struct hrtimer *timer = &p->dl.dl_timer; struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); /* @@ -1182,8 +1250,6 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); - - hrtimer_cancel(timer); } static void set_curr_task_dl(struct rq *rq) @@ -1231,6 +1297,32 @@ next_node: return NULL; } +/* + * Return the earliest pushable rq's task, which is suitable to be executed + * on the CPU, NULL otherwise: + */ +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct task_struct *p = NULL; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + +next_node: + if (next_node) { + p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + + if (pick_dl_task(rq, p, cpu)) + return p; + + next_node = rb_next(next_node); + goto next_node; + } + + return NULL; +} + static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -1244,7 +1336,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* @@ -1334,11 +1426,23 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); + if (later_rq->dl.dl_nr_running && + !dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) { + /* + * Target rq has tasks of equal or earlier deadline, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + later_rq = NULL; + break; + } + /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); @@ -1377,7 +1481,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1416,7 +1520,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - rq->curr->nr_cpus_allowed > 1) { + tsk_nr_cpus_allowed(rq->curr) > 1) { resched_curr(rq); return 0; } @@ -1469,20 +1573,21 @@ out: static void push_dl_tasks(struct rq *rq) { - /* Terminates as it moves a -deadline task */ + /* push_dl_task() will return true if it moved a -deadline task */ while (push_dl_task(rq)) ; } -static int pull_dl_task(struct rq *this_rq) +static void pull_dl_task(struct rq *this_rq) { - int this_cpu = this_rq->cpu, ret = 0, cpu; + int this_cpu = this_rq->cpu, cpu; struct task_struct *p; + bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; if (likely(!dl_overloaded(this_rq))) - return 0; + return; /* * Match the barrier from dl_set_overloaded; this guarantees that if we @@ -1515,7 +1620,7 @@ static int pull_dl_task(struct rq *this_rq) if (src_rq->dl.dl_nr_running <= 1) goto skip; - p = pick_next_earliest_dl_task(src_rq, this_cpu); + p = pick_earliest_pushable_dl_task(src_rq, this_cpu); /* * We found a task to be pulled if: @@ -1537,7 +1642,7 @@ static int pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - ret = 1; + resched = true; deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); @@ -1550,12 +1655,8 @@ skip: double_unlock_balance(this_rq, src_rq); } - return ret; -} - -static void post_schedule_dl(struct rq *rq) -{ - push_dl_tasks(rq); + if (resched) + resched_curr(this_rq); } /* @@ -1566,10 +1667,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_dl_tasks(rq) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && dl_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1578,9 +1678,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { - struct rq *rq; struct root_domain *src_rd; - int weight; + struct rq *rq; BUG_ON(!dl_task(p)); @@ -1606,37 +1705,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - /* - * Update only if the task is actually running (i.e., - * it is on the rq AND it is not throttled). - */ - if (!on_dl_rq(&p->dl)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_dl_task(rq, p); - BUG_ON(!rq->dl.dl_nr_migratory); - rq->dl.dl_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_dl_task(rq, p); - rq->dl.dl_nr_migratory++; - } - - update_dl_migration(&rq->dl); + set_cpus_allowed_common(p, new_mask); } /* Assumes rq->lock is held */ @@ -1660,7 +1729,7 @@ static void rq_offline_dl(struct rq *rq) cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } -void init_sched_dl_class(void) +void __init init_sched_dl_class(void) { unsigned int i; @@ -1671,37 +1740,16 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ -/* - * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. - */ -static void cancel_dl_timer(struct rq *rq, struct task_struct *p) -{ - struct hrtimer *dl_timer = &p->dl.dl_timer; - - /* Nobody will change task's class if pi_lock is held */ - lockdep_assert_held(&p->pi_lock); - - if (hrtimer_active(dl_timer)) { - int ret = hrtimer_try_to_cancel(dl_timer); - - if (unlikely(ret == -1)) { - /* - * Note, p may migrate OR new deadline tasks - * may appear in rq when we are unlocking it. - * A caller of us must be fine with that. - */ - raw_spin_unlock(&rq->lock); - hrtimer_cancel(dl_timer); - raw_spin_lock(&rq->lock); - } - } -} - static void switched_from_dl(struct rq *rq, struct task_struct *p) { - /* XXX we should retain the bw until 0-lag */ - cancel_dl_timer(rq, p); - __dl_clear_params(p); + /* + * Start the deadline timer; if we switch back to dl before this we'll + * continue consuming our current CBS slice. If we stay outside of + * SCHED_DEADLINE until the deadline passes, the timer will reset the + * task. + */ + if (!start_dl_timer(p)) + __dl_clear_params(p); /* * Since this might be the only -deadline task on the rq, @@ -1711,8 +1759,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) return; - if (pull_dl_task(rq)) - resched_curr(rq); + queue_pull_task(rq); } /* @@ -1721,21 +1768,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - int check_resched = 1; - if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && - push_dl_task(rq) && rq != task_rq(p)) - /* Only reschedule if pushing failed */ - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched) { - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); - } + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) + queue_push_tasks(rq); +#else + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#endif } } @@ -1755,15 +1797,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, * or lowering its prio, so... */ if (!rq->dl.overloaded) - pull_dl_task(rq); + queue_pull_task(rq); /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this * runqueue. */ - if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && - rq->curr == p) + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); #else /* @@ -1793,7 +1834,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/kernel/sched/debug.c b/kernel/kernel/sched/debug.c index 34b00001b..a2d69b883 100644 --- a/kernel/kernel/sched/debug.c +++ b/kernel/kernel/sched/debug.c @@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - if (!se) { - struct sched_avg *avg = &cpu_rq(cpu)->avg; - P(avg->runnable_avg_sum); - P(avg->avg_period); + if (!se) return; - } - PN(se->exec_start); PN(se->vruntime); @@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif P(se->load.weight); #ifdef CONFIG_SMP - P(se->avg.runnable_avg_sum); - P(se->avg.running_avg_sum); - P(se->avg.avg_period); - P(se->avg.load_avg_contrib); - P(se->avg.utilization_avg_contrib); - P(se->avg.decay_count); + P(se->avg.load_avg); + P(se->avg.util_avg); #endif #undef PN #undef P @@ -132,15 +123,17 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); #ifdef CONFIG_SCHEDSTATS SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.statistics.wait_sum), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.statistics.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", - 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + 0LL, 0L, + SPLIT_NS(p->se.sum_exec_runtime), + 0LL, 0L); #endif #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", task_node(p)); + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); @@ -156,7 +149,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" + " wait-time sum-exec sum-sleep\n" "------------------------------------------------------" "----------------------------------------------------\n"); @@ -212,26 +205,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %lu\n", "load_avg", + cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", - cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", - cfs_rq->utilization_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", + atomic_long_read(&cfs_rq->removed_load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", + atomic_long_read(&cfs_rq->removed_util_avg)); #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", - cfs_rq->tg_load_contrib); - SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", - cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", + cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", - atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif #ifdef CONFIG_CFS_BANDWIDTH - SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", - cfs_rq->tg->cfs_bandwidth.timer_active); SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", @@ -520,11 +511,21 @@ __initcall(init_sched_debug_procfs); SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#ifdef CONFIG_NUMA_BALANCING +void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf) +{ + SEQ_printf(m, "numa_faults node=%d ", node); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); +} +#endif + + static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING struct mempolicy *pol; - int node, i; if (p->mm) P(mm->numa_scan_seq); @@ -536,26 +537,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) mpol_get(pol); task_unlock(p); - SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); - - for_each_online_node(node) { - for (i = 0; i < 2; i++) { - unsigned long nr_faults = -1; - int cpu_current, home_node; - - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; - - cpu_current = !i ? (task_node(p) == node) : - (pol && node_isset(node, pol->v.nodes)); - - home_node = (p->numa_preferred_nid == node); - - SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", - i, node, cpu_current, home_node, nr_faults); - } - } - + P(numa_pages_migrated); + P(numa_preferred_nid); + P(total_numa_faults); + SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", + task_node(p), task_numa_group_id(p)); + show_numa_stats(p, m); mpol_put(pol); #endif } @@ -585,6 +572,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS + PN(se.statistics.sum_sleep_runtime); PN(se.statistics.wait_start); PN(se.statistics.sleep_start); PN(se.statistics.block_start); @@ -642,12 +630,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP - P(se.avg.runnable_avg_sum); - P(se.avg.running_avg_sum); - P(se.avg.avg_period); - P(se.avg.load_avg_contrib); - P(se.avg.utilization_avg_contrib); - P(se.avg.decay_count); + P(se.avg.load_sum); + P(se.avg.util_sum); + P(se.avg.load_avg); + P(se.avg.util_avg); + P(se.avg.last_update_time); #endif P(policy); P(prio); diff --git a/kernel/kernel/sched/fair.c b/kernel/kernel/sched/fair.c index 92c53d074..8c71e627b 100644 --- a/kernel/kernel/sched/fair.c +++ b/kernel/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include <linux/latencytop.h> @@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ -static int get_update_sysctl_factor(void) +static unsigned int get_update_sysctl_factor(void) { - unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update); - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; - /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -576,7 +571,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int factor = get_update_sysctl_factor(); + unsigned int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) */ static u64 __sched_period(unsigned long nr_running) { - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; + if (unlikely(nr_running > sched_nr_latency)) + return nr_running * sysctl_sched_min_granularity; + else + return sysctl_sched_latency; } /* @@ -669,22 +659,38 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); -static inline void __update_task_entity_contrib(struct sched_entity *se); -static inline void __update_task_entity_utilization(struct sched_entity *se); +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are + * dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ -/* Give new task start runnable values to heavy its load in infant time */ -void init_task_runnable_average(struct task_struct *p) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) { - u32 slice; + struct sched_avg *sa = &se->avg; - slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; - p->se.avg.avg_period = slice; - __update_task_entity_contrib(&p->se); - __update_task_entity_utilization(&p->se); + sa->last_update_time = 0; + /* + * sched_avg's period_contrib should be strictly less then 1024, so + * we give it 1023 to make sure it is almost a period (1024us), and + * will definitely be update (after enqueue). + */ + sa->period_contrib = 1023; + sa->load_avg = scale_load_down(se->load.weight); + sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else -void init_task_runnable_average(struct task_struct *p) +void init_entity_runnable_average(struct sched_entity *se) { } #endif @@ -834,7 +840,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { - unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; @@ -1198,11 +1204,9 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { + long imb, old_imb; + long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; - long orig_src_load; - long load_a, load_b; - long moved_load; - long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1215,39 +1219,30 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - load_a = dst_load; - load_b = src_load; - if (load_a < load_b) - swap(load_a, load_b); + if (dst_load < src_load) + swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = load_a * src_capacity * 100 - - load_b * dst_capacity * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Allow a move that brings us closer to a balanced situation, - * without moving things past the point of balance. + * Compare it with the old imbalance. */ orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; - /* - * In a task swap, there will be one load moving from src to dst, - * and another moving back. This is the net sum of both moves. - * A simple task move will always have a positive value. - * Allow the move if it brings the system closer to a balanced - * situation, without crossing over the balance point. - */ - moved_load = orig_src_load - src_load; + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); - if (moved_load > 0) - /* Moving src -> dst. Did we overshoot balance? */ - return src_load * dst_capacity < dst_load * src_capacity; - else - /* Moving dst -> src. Did we overshoot balance? */ - return dst_load * src_capacity < src_load * dst_capacity; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); } /* @@ -1409,6 +1404,31 @@ static void task_numa_find_cpu(struct task_numa_env *env, } } +/* Only move tasks to a NUMA node less busy than the current node. */ +static bool numa_has_capacity(struct task_numa_env *env) +{ + struct numa_stats *src = &env->src_stats; + struct numa_stats *dst = &env->dst_stats; + + if (src->has_free_capacity && !dst->has_free_capacity) + return false; + + /* + * Only consider a task move if the source has a higher load + * than the destination, corrected for CPU capacity on each node. + * + * src->load dst->load + * --------------------- vs --------------------- + * src->compute_capacity dst->compute_capacity + */ + if (src->load * dst->compute_capacity * env->imbalance_pct > + + dst->load * src->compute_capacity * 100) + return true; + + return false; +} + static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1463,7 +1483,8 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - task_numa_find_cpu(&env, taskimp, groupimp); + if (numa_has_capacity(&env)) + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1494,7 +1515,8 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - task_numa_find_cpu(&env, taskimp, groupimp); + if (numa_has_capacity(&env)) + task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1687,8 +1709,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.avg_period; + delta = p->se.avg.load_sum / p->se.load.weight; + *period = LOAD_AVG_MAX; } p->last_sum_exec_runtime = runtime; @@ -1794,7 +1816,12 @@ static void task_numa_placement(struct task_struct *p) u64 runtime, period; spinlock_t *group_lock = NULL; - seq = ACCESS_ONCE(p->mm->numa_scan_seq); + /* + * The p->mm->numa_scan_seq field gets updated without + * exclusive access. Use READ_ONCE() here to ensure + * that the field is read in a single access: + */ + seq = READ_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; @@ -1938,7 +1965,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, } rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + tsk = READ_ONCE(cpu_rq(cpu)->curr); if (!cpupid_match_pid(tsk, cpupid)) goto no_join; @@ -2043,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!numabalancing_enabled) + if (!static_branch_likely(&sched_numa_balancing)) return; /* for example, ksmd faulting in a user's mm */ @@ -2107,7 +2134,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) static void reset_ptenuma_scan(struct task_struct *p) { - ACCESS_ONCE(p->mm->numa_scan_seq)++; + /* + * We only did a read acquisition of the mmap sem, so + * p->mm->numa_scan_seq is written to without exclusive access + * and the update is not guaranteed to be atomic. That's not + * much of an issue though, since this is just used for + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not + * expensive, to avoid any form of compiler optimizations: + */ + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); p->mm->numa_scan_offset = 0; } @@ -2123,7 +2158,7 @@ void task_numa_work(struct callback_head *work) struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; - long pages; + long pages, virtpages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -2169,9 +2204,11 @@ void task_numa_work(struct callback_head *work) start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + virtpages = pages * 8; /* Scan up to this much virtual space */ if (!pages) return; + down_read(&mm->mmap_sem); vma = find_vma(mm, start); if (!vma) { @@ -2206,18 +2243,22 @@ void task_numa_work(struct callback_head *work) start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); + nr_pte_updates = change_prot_numa(vma, start, end); /* - * Scan sysctl_numa_balancing_scan_size but ensure that - * at least one PTE is updated so that unused virtual - * address space is quickly skipped. + * Try to scan sysctl_numa_balancing_size worth of + * hpages that have at least one present PTE that + * is not already pte-numa. If the VMA contains + * areas that are unused or already full of prot_numa + * PTEs, scan up to virtpages, to skip through those + * areas faster. */ if (nr_pte_updates) pages -= (end - start) >> PAGE_SHIFT; + virtpages -= (end - start) >> PAGE_SHIFT; start = end; - if (pages <= 0) + if (pages <= 0 || virtpages <= 0) goto out; cond_resched(); @@ -2261,7 +2302,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) now = curr->se.sum_exec_runtime; period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - if (now - curr->node_stamp > period) { + if (now > curr->node_stamp + period) { if (!curr->node_stamp) curr->numa_scan_period = task_scan_min(curr); curr->node_stamp += period; @@ -2323,12 +2364,12 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) long tg_weight; /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). + * Use this CPU's real-time load instead of the last load contribution + * as the updating of the contribution is delayed, and we will use the + * the real-time load to calc the share. See update_tg_load_avg(). */ tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_contrib; + tg_weight -= cfs_rq->tg_load_avg_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -2401,14 +2442,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ - /* Precomputed fixed inverse multiplies for multiplication by y^n */ static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -2457,9 +2490,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val *= runnable_avg_yN_inv[local_n]; - /* We don't use SRR here since we always want to round down. */ - return val >> 32; + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; } /* @@ -2490,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 +#error "load tracking assumes 2^10 as unit" +#endif + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2518,23 +2556,22 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, - struct sched_avg *sa, - int runnable, - int running) +static __always_inline int +__update_load_avg(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, periods; - u32 runnable_contrib; - int delta_w, decayed = 0; - unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + u64 delta, scaled_delta, periods; + u32 contrib; + unsigned int delta_w, scaled_delta_w, decayed = 0; + unsigned long scale_freq, scale_cpu; - delta = now - sa->last_runnable_update; + delta = now - sa->last_update_time; /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. */ if ((s64)delta < 0) { - sa->last_runnable_update = now; + sa->last_update_time = now; return 0; } @@ -2545,26 +2582,35 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, delta >>= 10; if (!delta) return 0; - sa->last_runnable_update = now; + sa->last_update_time = now; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->avg_period % 1024; + delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { - /* period roll-over */ decayed = 1; + /* how much left for next period will start over, we don't know yet */ + sa->period_contrib = 0; + /* * Now that we know we're crossing a period boundary, figure * out how much from delta we need to complete the current * period and accrue it. */ delta_w = 1024 - delta_w; - if (runnable) - sa->runnable_avg_sum += delta_w; + scaled_delta_w = cap_scale(delta_w, scale_freq); + if (weight) { + sa->load_sum += weight * scaled_delta_w; + if (cfs_rq) { + cfs_rq->runnable_load_sum += + weight * scaled_delta_w; + } + } if (running) - sa->running_avg_sum += delta_w * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta_w; + sa->util_sum += scaled_delta_w * scale_cpu; delta -= delta_w; @@ -2572,341 +2618,221 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, periods = delta / 1024; delta %= 1024; - sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, - periods + 1); - sa->running_avg_sum = decay_load(sa->running_avg_sum, - periods + 1); - sa->avg_period = decay_load(sa->avg_period, - periods + 1); + sa->load_sum = decay_load(sa->load_sum, periods + 1); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods + 1); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - runnable_contrib = __compute_runnable_contrib(periods); - if (runnable) - sa->runnable_avg_sum += runnable_contrib; + contrib = __compute_runnable_contrib(periods); + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } if (running) - sa->running_avg_sum += runnable_contrib * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += runnable_contrib; + sa->util_sum += contrib * scale_cpu; } /* Remainder of delta accrued against u_0` */ - if (runnable) - sa->runnable_avg_sum += delta; + scaled_delta = cap_scale(delta, scale_freq); + if (weight) { + sa->load_sum += weight * scaled_delta; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * scaled_delta; + } if (running) - sa->running_avg_sum += delta * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta; - - return decayed; -} - -/* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline u64 __synchronize_entity_decay(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 decays = atomic64_read(&cfs_rq->decay_counter); + sa->util_sum += scaled_delta * scale_cpu; - decays -= se->avg.decay_count; - se->avg.decay_count = 0; - if (!decays) - return 0; + sa->period_contrib += delta; - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.utilization_avg_contrib = - decay_load(se->avg.utilization_avg_contrib, decays); + if (decayed) { + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + } - return decays; + return decayed; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) -{ - struct task_group *tg = cfs_rq->tg; - long tg_contrib; - - tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; - tg_contrib -= cfs_rq->tg_load_contrib; - - if (!tg_contrib) - return; - - if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic_long_add(tg_contrib, &tg->load_avg); - cfs_rq->tg_load_contrib += tg_contrib; - } -} - /* - * Aggregate cfs_rq runnable averages into an equivalent task_group - * representation for computing load contributions. + * Updating tg's load_avg is necessary before update_cfs_share (which is done) + * and effective_load (which is not done because it is too costly). */ -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { - struct task_group *tg = cfs_rq->tg; - long contrib; + long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; - /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->avg_period + 1); - contrib -= cfs_rq->tg_runnable_contrib; - - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { - atomic_add(contrib, &tg->runnable_avg); - cfs_rq->tg_runnable_contrib += contrib; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } -static inline void __update_group_entity_contrib(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg = cfs_rq->tg; - int runnable_avg; - - u64 contrib; - - contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div_u64(contrib, - atomic_long_read(&tg->load_avg) + 1); - - /* - * For group entities we need to compute a correction term in the case - * that they are consuming <1 cpu so that we would contribute the same - * load as a task of equal weight. - * - * Explicitly co-ordinating this measurement would be expensive, but - * fortunately the sum of each cpus contribution forms a usable - * lower-bound on the true value. - * - * Consider the aggregate of 2 contributions. Either they are disjoint - * (and the sum represents true value) or they are disjoint and we are - * understating by the aggregate of their overlap. - * - * Extending this to N cpus, for a given overlap, the maximum amount we - * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of - * cpus that overlap for this interval and w_i is the interval width. - * - * On a small machine; the first term is well-bounded which bounds the - * total error since w_i is a subset of the period. Whereas on a - * larger machine, while this first term can be larger, if w_i is the - * of consequential size guaranteed to see n_i*w_i quickly converge to - * our upper bound of 1-cpu. - */ - runnable_avg = atomic_read(&tg->runnable_avg); - if (runnable_avg < NICE_0_LOAD) { - se->avg.load_avg_contrib *= runnable_avg; - se->avg.load_avg_contrib >>= NICE_0_SHIFT; - } -} - -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, - runnable, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) {} -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) {} -static inline void __update_group_entity_contrib(struct sched_entity *se) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_task_entity_contrib(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.avg_period + 1); - se->avg.load_avg_contrib = scale_load(contrib); -} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - long old_contrib = se->avg.load_avg_contrib; + struct sched_avg *sa = &cfs_rq->avg; + int decayed, removed = 0; - if (entity_is_task(se)) { - __update_task_entity_contrib(se); - } else { - __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); - __update_group_entity_contrib(se); + if (atomic_long_read(&cfs_rq->removed_load_avg)) { + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + sa->load_avg = max_t(long, sa->load_avg - r, 0); + sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + removed = 1; } - return se->avg.load_avg_contrib - old_contrib; -} + if (atomic_long_read(&cfs_rq->removed_util_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + } + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); -static inline void __update_task_entity_utilization(struct sched_entity *se) -{ - u32 contrib; +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); - contrib /= (se->avg.avg_period + 1); - se->avg.utilization_avg_contrib = scale_load(contrib); + return decayed || removed; } -static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) { - long old_contrib = se->avg.utilization_avg_contrib; - - if (entity_is_task(se)) - __update_task_entity_utilization(se); - else - se->avg.utilization_avg_contrib = - group_cfs_rq(se)->utilization_load_avg; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int cpu = cpu_of(rq_of(cfs_rq)); - return se->avg.utilization_avg_contrib - old_contrib; -} + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); -static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, - long load_contrib) -{ - if (likely(load_contrib < cfs_rq->blocked_load_avg)) - cfs_rq->blocked_load_avg -= load_contrib; - else - cfs_rq->blocked_load_avg = 0; + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - -/* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta, utilization_delta; - int cpu = cpu_of(rq_of(cfs_rq)); - u64 now; + if (!sched_feat(ATTACH_AGE_LOAD)) + goto skip_aging; /* - * For a group entity we need to use their owned cfs_rq_clock_task() in - * case they are the parent of a throttled hierarchy. + * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time. */ - if (entity_is_task(se)) - now = cfs_rq_clock_task(cfs_rq); - else - now = cfs_rq_clock_task(group_cfs_rq(se)); + if (se->avg.last_update_time) { + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, 0, 0, NULL); - if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, - cfs_rq->curr == se)) - return; + /* + * XXX: we could have just aged the entire load away if we've been + * absent from the fair class for too long. + */ + } - contrib_delta = __update_entity_load_avg_contrib(se); - utilization_delta = __update_entity_utilization_avg_contrib(se); +skip_aging: + se->avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; +} - if (!update_cfs_rq) - return; +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); - if (se->on_rq) { - cfs_rq->runnable_load_avg += contrib_delta; - cfs_rq->utilization_load_avg += utilization_delta; - } else { - subtract_blocked_load_contrib(cfs_rq, -contrib_delta); - } + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); } -/* - * Decay the load contributed by all blocked children and account this so that - * their contribution may appropriately discounted when they wake up. - */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +/* Add the load generated by se into cfs_rq's load average */ +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 now = cfs_rq_clock_task(cfs_rq) >> 20; - u64 decays; - - decays = now - cfs_rq->last_decay; - if (!decays && !force_update) - return; + struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + int migrated, decayed; - if (atomic_long_read(&cfs_rq->removed_load)) { - unsigned long removed_load; - removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); - subtract_blocked_load_contrib(cfs_rq, removed_load); + migrated = !sa->last_update_time; + if (!migrated) { + __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); } - if (decays) { - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); - cfs_rq->last_decay = now; - } + decayed = update_cfs_rq_load_avg(now, cfs_rq); + + cfs_rq->runnable_load_avg += sa->load_avg; + cfs_rq->runnable_load_sum += sa->load_sum; + + if (migrated) + attach_entity_load_avg(cfs_rq, se); - __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + if (decayed || migrated) + update_tg_load_avg(cfs_rq, 0); } -/* Add the load generated by se into cfs_rq's child load-average */ -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) +/* Remove the runnable load generated by se from cfs_rq's runnable load average */ +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * We track migrations using entity decay_count <= 0, on a wake-up - * migration we use a negative decay count to track the remote decays - * accumulated while sleeping. - * - * Newly forked tasks are enqueued with se->avg.decay_count == 0, they - * are seen by enqueue_entity_load_avg() as a migration with an already - * constructed load_avg_contrib. - */ - if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); - if (se->avg.decay_count) { - /* - * In a wake-up migration we have to approximate the - * time sleeping. This is because we can't synchronize - * clock_task between the two cpus, and it is not - * guaranteed to be read-safe. Instead, we can - * approximate this using our carried decays, which are - * explicitly atomically readable. - */ - se->avg.last_runnable_update -= (-se->avg.decay_count) - << 20; - update_entity_load_avg(se, 0); - /* Indicate that we're now synchronized and on-rq */ - se->avg.decay_count = 0; - } - wakeup = 0; - } else { - __synchronize_entity_decay(se); - } - - /* migrated tasks did not contribute to our blocked load */ - if (wakeup) { - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - update_entity_load_avg(se, 0); - } + update_load_avg(se, 1); - cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !wakeup); + cfs_rq->runnable_load_avg = + max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + cfs_rq->runnable_load_sum = + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* - * Remove se's load from this cfs_rq child load-average, if the entity is - * transitioning to a blocked state we track its projected decay using - * blocked_load_avg. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) +void remove_entity_load_avg(struct sched_entity *se) { - update_entity_load_avg(se, 1); - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !sleep); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + +#ifndef CONFIG_64BIT + u64 last_update_time_copy; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); +#else + last_update_time = cfs_rq->avg.last_update_time; +#endif - cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; - if (sleep) { - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - } /* migrations, e.g. sleep=0 leave decay_count == 0 */ + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } /* @@ -2916,7 +2842,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, */ void idle_enter_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 1); } /* @@ -2926,24 +2851,33 @@ void idle_enter_fair(struct rq *this_rq) */ void idle_exit_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 0); +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->runnable_load_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; } static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) {} -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void remove_entity_load_avg(struct sched_entity *se) {} + +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) { @@ -3075,7 +3009,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3150,7 +3084,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -3240,7 +3174,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_entity_load_avg(se, 1); + update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3340,7 +3274,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev, 1); + update_load_avg(prev, 0); } cfs_rq->curr = NULL; } @@ -3356,8 +3290,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq, 1); + update_load_avg(curr, 1); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -3476,16 +3409,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { - /* - * If the bandwidth pool has become inactive, then at least one - * period must have elapsed since the last consumption. - * Refresh the global state and ensure bandwidth timer becomes - * active. - */ - if (!cfs_b->timer_active) { - __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b, false); - } + start_cfs_bandwidth(cfs_b); if (cfs_b->runtime > 0) { amount = min(cfs_b->runtime, min_amount); @@ -3634,6 +3558,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, dequeue = 1; + bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -3663,13 +3588,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); + empty = list_empty(&cfs_b->throttled_cfs_rq); + /* * Add to the _head_ of the list, so that an already-started * distribute_cfs_runtime will not see us */ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b, false); + + /* + * If we're the first throttled task, make sure the bandwidth + * timer is running. + */ + if (empty) + start_cfs_bandwidth(cfs_b); + raw_spin_unlock(&cfs_b->lock); } @@ -3784,13 +3717,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (cfs_b->idle && !throttled) goto out_deactivate; - /* - * if we have relooped after returning idle once, we need to update our - * status as actually running, so that other cpus doing - * __start_cfs_bandwidth will stop trying to cancel us. - */ - cfs_b->timer_active = 1; - __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3835,7 +3761,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) return 0; out_deactivate: - cfs_b->timer_active = 0; return 1; } @@ -3850,7 +3775,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; * Are we near the end of the current quota period? * * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the - * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * hrtimer base being cleared by hrtimer_start. In the case of * migrate_hrtimers, base is never cleared, so we are fine. */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) @@ -3878,8 +3803,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; - start_bandwidth_timer(&cfs_b->slack_timer, - ns_to_ktime(cfs_bandwidth_slack_period)); + hrtimer_start(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period), + HRTIMER_MODE_REL); } /* we know any runtime found here is valid as update_curr() precedes return */ @@ -3999,6 +3925,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); return HRTIMER_NORESTART; @@ -4008,20 +3935,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; int overrun; int idle = 0; raw_spin_lock(&cfs_b->lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - + overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; idle = do_sched_cfs_period_timer(cfs_b, overrun); } + if (idle) + cfs_b->period_active = 0; raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; @@ -4035,7 +3961,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period = ns_to_ktime(default_cfs_period()); INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; @@ -4047,28 +3973,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) INIT_LIST_HEAD(&cfs_rq->throttled_list); } -/* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer)) && - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { - /* bounce the lock to allow do_sched_cfs_period_timer to run */ - raw_spin_unlock(&cfs_b->lock); - cpu_relax(); - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (!force && cfs_b->timer_active) - return; - } + lockdep_assert_held(&cfs_b->lock); - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); + if (!cfs_b->period_active) { + cfs_b->period_active = 1; + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); + } } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -4250,14 +4163,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { - update_rq_runnable_avg(rq, rq->nr_running); + if (!se) add_nr_running(rq, 1); - } + hrtick_update(rq); } @@ -4311,22 +4223,204 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { + if (!se) sub_nr_running(rq, 1); - update_rq_runnable_avg(rq, 1); - } + hrtick_update(rq); } #ifdef CONFIG_SMP + +/* + * per rq 'load' arrray crap; XXX kill this. + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->cfs.runnable_load_avg; + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = weighted_cpuload(cpu_of(this_rq)); + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); } /* @@ -4375,8 +4469,8 @@ static unsigned long capacity_orig_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.runnable_load_avg; + unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); + unsigned long load_avg = weighted_cpuload(cpu); if (nr_running) return load_avg / nr_running; @@ -4495,7 +4589,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->load.weight + wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) @@ -4516,7 +4610,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * wl = dw_i = S * (s'_i - s_i); see (3) */ - wl -= se->load.weight; + wl -= se->avg.load_avg; /* * Recursively apply this logic to all parent groups to compute @@ -4539,26 +4633,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. In order + * to determine whether we should let the load spread vs consolodating to + * shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. With + * both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. Waker/wakee + * being client/server, worker/dispatcher, interrupt source or whatever is + * irrelevant, spread criteria is apparent partner count exceeds socket size. + */ static int wake_wide(struct task_struct *p) { + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - /* - * Yeah, it's the switching-frequency, could means many wakee or - * rapidly switch, use factor here will just help to automatically - * adjust the loose-degree, so bigger node will lead to more pull. - */ - if (p->wakee_flips > factor) { - /* - * wakee is somewhat hot, it needs certain amount of cpu - * resource, so if waker is far more hot, prefer to leave - * it alone. - */ - if (current->wakee_flips > (factor * p->wakee_flips)) - return 1; - } - - return 0; + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) @@ -4570,13 +4667,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; - /* - * If we wake multiple tasks be careful to not bounce - * ourselves around too much. - */ - if (wake_wide(p)) - return 0; - idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -4590,14 +4680,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -4785,32 +4875,39 @@ next: done: return target; } + /* - * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can - * compare the usage with the capacity of the CPU that is available for CFS - * task (ie cpu_capacity). - * cfs.utilization_load_avg is the sum of running time of runnable tasks on a - * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full - * capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in avg_period and running_load_avg or just - * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the usage stays into the range - * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the usage, a group could be seen as overloaded (CPU0 usage - * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). */ -static int get_cpu_usage(int cpu) +static int cpu_util(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (usage >= SCHED_LOAD_SCALE) - return capacity; - - return (usage * capacity) >> SCHED_LOAD_SHIFT; + return (util >= capacity) ? capacity : util; } /* @@ -4830,17 +4927,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = cpu; + int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; + break; /* * If both cpu and prev_cpu are part of this domain, @@ -4854,17 +4951,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; + else if (!want_affine) + break; } - if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; + if (affine_sd) { + sd = NULL; /* Prefer wake_affine over balance flags */ + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + new_cpu = cpu; } - while (sd) { + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, new_cpu); + + } else while (sd) { struct sched_group *group; int weight; @@ -4898,7 +4999,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } -unlock: rcu_read_unlock(); return new_cpu; @@ -4910,26 +5010,27 @@ unlock: * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void -migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - /* - * Load tracking: accumulate removed load so that it can be processed - * when we next update owning cfs_rq under rq->lock. Tasks contribute - * to blocked load iff they have a positive decay-count. It can never - * be negative here since on-rq tasks have decay-count == 0. + * We are supposed to update the task to "current" time, then its up to date + * and ready to go to new CPU/cfs_rq. But we have difficulty in getting + * what current time is, so simply throw away the out-of-date time. This + * will result in the wakee task is less decayed, but giving the wakee more + * load sounds not bad. */ - if (se->avg.decay_count) { - se->avg.decay_count = -__synchronize_entity_decay(se); - atomic_long_add(se->avg.load_avg_contrib, - &cfs_rq->removed_load); - } + remove_entity_load_avg(&p->se); + + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; +} + +static void task_dead_fair(struct task_struct *p) +{ + remove_entity_load_avg(&p->se); } #endif /* CONFIG_SMP */ @@ -5126,18 +5227,21 @@ again: * entity, update_curr() will update its vruntime, otherwise * forget we've ever seen it. */ - if (curr && curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; - /* - * This call to check_cfs_rq_runtime() will do the throttle and - * dequeue its entity in the parent(s). Therefore the 'simple' - * nr_running test will indeed be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto simple; + /* + * This call to check_cfs_rq_runtime() will do the + * throttle and dequeue its entity in the parent(s). + * Therefore the 'simple' nr_running test will indeed + * be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + } se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); @@ -5198,7 +5302,15 @@ simple: return p; idle: + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + lockdep_unpin_lock(&rq->lock); new_tasks = idle_balance(rq); + lockdep_pin_lock(&rq->lock); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5467,90 +5579,57 @@ static int task_hot(struct task_struct *p, struct lb_env *env) } #ifdef CONFIG_NUMA_BALANCING -/* Returns true if the destination node has incurred more faults */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +/* + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. + */ +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { - return false; - } + if (!static_branch_likely(&sched_numa_balancing)) + return -1; + + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + return -1; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return false; - - if (numa_group) { - /* Task is already in the group's interleave set. */ - if (node_isset(src_nid, numa_group->active_nodes)) - return false; - - /* Task is moving into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return true; + return -1; - return group_faults(p, dst_nid) > group_faults(p, src_nid); + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) + return 1; + else + return -1; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return true; - - return task_faults(p, dst_nid) > task_faults(p, src_nid); -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - struct numa_group *numa_group = rcu_dereference(p->numa_group); - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; - - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; + return 0; if (numa_group) { - /* Task is moving within/into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return false; - - /* Task is moving out of the group's interleave set. */ - if (node_isset(src_nid, numa_group->active_nodes)) - return true; - - return group_faults(p, dst_nid) < group_faults(p, src_nid); + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); } - /* Migrating away from the preferred node is always bad. */ - if (src_nid == p->numa_preferred_nid) - return true; - - return task_faults(p, dst_nid) < task_faults(p, src_nid); + return dst_faults < src_faults; } #else -static inline bool migrate_improves_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, +static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return false; + return -1; } #endif @@ -5560,7 +5639,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p, static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot = 0; + int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); @@ -5618,13 +5697,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); + tsk_cache_hot = migrate_degrades_locality(p, env); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, env); - if (migrate_improves_locality(p, env) || !tsk_cache_hot || + if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot) { + if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } @@ -5698,6 +5777,13 @@ static int detach_tasks(struct lb_env *env) return 0; while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, + * which could at worst lead to a livelock crash. + */ + if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + break; + p = list_first_entry(tasks, struct task_struct, se.group_node); env->loop++; @@ -5807,39 +5893,6 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) -{ - struct sched_entity *se = tg->se[cpu]; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - return; - - update_cfs_rq_blocked_load(cfs_rq, 1); - - if (se) { - update_entity_load_avg(se, 1); - /* - * We pivot on our runnable average having decayed to zero for - * list removal. This generally implies that all our children - * have also been removed (modulo rounding error or bandwidth - * control); however, such cases are rare and we can fix these - * at enqueue. - * - * TODO: fix up out-of-order children on enqueue. - */ - if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) - list_del_leaf_cfs_rq(cfs_rq); - } else { - struct rq *rq = rq_of(cfs_rq); - update_rq_runnable_avg(rq, rq->nr_running); - } -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -5848,19 +5901,19 @@ static void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* - * Note: We may want to consider periodically releasing - * rq->lock about these updates so that creating many task - * groups does not result in continually extending hold time. - */ - __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - } + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + update_tg_load_avg(cfs_rq, 0); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -5888,14 +5941,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg_contrib, - cfs_rq->runnable_load_avg + 1); + load = div64_ul(load * se->avg.load_avg, + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -5907,17 +5960,25 @@ static unsigned long task_h_load(struct task_struct *p) struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); - return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, - cfs_rq->runnable_load_avg + 1); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq_load_avg(cfs_rq) + 1); } #else static inline void update_blocked_averages(int cpu) { + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static unsigned long task_h_load(struct task_struct *p) { - return p->se.avg.load_avg_contrib; + return p->se.avg.load_avg; } #endif @@ -5938,7 +5999,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_usage; /* Total usage of the group */ + unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -6014,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_cpu_capacity(sd, cpu); -} - static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6037,8 +6085,8 @@ static unsigned long scale_rt_capacity(int cpu) * Since we're reading these variables without serialization make sure * we read them once before doing sanity checks on them. */ - age_stamp = ACCESS_ONCE(rq->age_stamp); - avg = ACCESS_ONCE(rq->rt_avg); + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); delta = __rq_clock_broken(rq) - age_stamp; if (unlikely(delta < 0)) @@ -6056,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = SCHED_CAPACITY_SCALE; + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_cpu_capacity(sd, cpu); - else - capacity *= default_scale_cpu_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); @@ -6191,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group) * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the * number of task is - * smaller than the number of CPUs or if the usage is lower than the available - * capacity for CFS tasks. + * smaller than the number of CPUs or if the utilization is lower than the + * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. @@ -6206,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) return true; if ((sgs->group_capacity * 100) > - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6227,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; if ((sgs->group_capacity * 100) < - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; } -static enum group_type group_classify(struct lb_env *env, - struct sched_group *group, - struct sg_lb_stats *sgs) +static inline enum +group_type group_classify(struct sched_group *group, + struct sg_lb_stats *sgs) { if (sgs->group_no_capacity) return group_overloaded; @@ -6275,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->group_usage += get_cpu_usage(i); + sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) @@ -6300,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(env, group, sgs); + sgs->group_type = group_classify(group, sgs); } /** @@ -6434,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_overloaded; + sgs->group_type = group_classify(sg, sgs); } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -7226,9 +7267,6 @@ static int idle_balance(struct rq *this_rq) goto out; } - /* - * Drop the rq->lock, but keep IRQ/preempt disabled. - */ raw_spin_unlock(&this_rq->lock); update_blocked_averages(this_cpu); @@ -7617,8 +7655,22 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) + if (likely(update_next_balance)) { rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. + */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } } #ifdef CONFIG_NO_HZ_COMMON @@ -7631,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) @@ -7662,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rebalance_domains(rq, CPU_IDLE); } - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } } - nohz.next_balance = this_rq->next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } @@ -7818,10 +7882,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (numabalancing_enabled) + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); - - update_rq_runnable_avg(rq, 1); } /* @@ -7896,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) +static inline bool vruntime_normalized(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when it's - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. + * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, + * the dequeue_entity(.flags=0) will already have normalized the + * vruntime. + */ + if (p->on_rq) + return true; + + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: * - * If it's queued, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !queued, then only when - * the task is sleeping will it still have non-normalized vruntime. + * - A forked child which is waiting for being woken up by + * wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { + if (!se->sum_exec_runtime || p->state == TASK_WAKING) + return true; + + return false; +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7919,44 +7999,50 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#ifdef CONFIG_SMP - /* - * Remove our load from contribution when we leave sched_fair - * and ensure we don't carry in an old decay_count if we - * switch back. - */ - if (se->avg.decay_count) { - __synchronize_entity_decay(se); - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - } -#endif + /* Catch up with the cfs_rq and remove our load when we leave */ + detach_entity_load_avg(cfs_rq, se); } -/* - * We switched to the sched_fair class. - */ -static void switched_to_fair(struct rq *rq, struct task_struct *p) +static void attach_task_cfs_rq(struct task_struct *p) { -#ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) - return; - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_curr(rq); - else - check_preempt_curr(rq, p, 0); + /* Synchronize task with its cfs_rq */ + attach_entity_load_avg(cfs_rq, se); + + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; +} + +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + + if (task_on_rq_queued(p)) { + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (rq->curr == p) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); + } } /* Account for a task changing its policy or group. @@ -7985,62 +8071,22 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP - atomic64_set(&cfs_rq->decay_counter, 1); - atomic_long_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load_avg, 0); + atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int queued) +static void task_move_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq; - - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !queued, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. - */ - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - queued = 1; - - if (!queued) - se->vruntime -= cfs_rq_of(se)->min_vruntime; + detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!queued) { - cfs_rq = cfs_rq_of(se); - se->vruntime += cfs_rq->min_vruntime; + #ifdef CONFIG_SMP - /* - * migrate_task_rq_fair() will have removed our previous - * contribution, but we must synchronize for ongoing future - * decay. - */ - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + /* Tell se's cfs_rq has been changed -- migrated */ + p->se.avg.last_update_time = 0; #endif - } + attach_task_cfs_rq(p); } void free_fair_sched_group(struct task_group *tg) @@ -8052,8 +8098,11 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) + if (tg->se) { + if (tg->se[i]) + remove_entity_load_avg(tg->se[i]); kfree(tg->se[i]); + } } kfree(tg->cfs_rq); @@ -8090,6 +8139,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_entity_runnable_average(se); } return 1; @@ -8239,6 +8289,8 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, + .task_dead = task_dead_fair, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_fair, @@ -8268,7 +8320,27 @@ void print_cfs_stats(struct seq_file *m, int cpu) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } -#endif + +#ifdef CONFIG_NUMA_BALANCING +void show_numa_stats(struct task_struct *p, struct seq_file *m) +{ + int node; + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + + for_each_online_node(node) { + if (p->numa_faults) { + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + if (p->numa_group) { + gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + print_numa_stats(m, node, tsf, tpf, gsf, gpf); + } +} +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/kernel/sched/features.h b/kernel/kernel/sched/features.h index 0ea4e3775..6d28fcd08 100644 --- a/kernel/kernel/sched/features.h +++ b/kernel/kernel/sched/features.h @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) */ SCHED_FEAT(WAKEUP_PREEMPTION, true) -/* - * Use arch dependent cpu capacity functions - */ -SCHED_FEAT(ARCH_CAPACITY, true) - SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) @@ -80,27 +75,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= - */ -#ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) - -/* - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a - * higher number of hinting faults are recorded during active load - * balancing. - */ -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) - -/* - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a - * lower number of hinting faults have been recorded. As this has - * the potential to prevent a task ever migrating to a new node - * due to CPU overload it is disabled by default. - */ -SCHED_FEAT(NUMA_RESIST_LOWER, false) -#endif diff --git a/kernel/kernel/sched/idle.c b/kernel/kernel/sched/idle.c index fefcb1fa5..4a2ef5a02 100644 --- a/kernel/kernel/sched/idle.c +++ b/kernel/kernel/sched/idle.c @@ -15,6 +15,15 @@ #include "sched.h" +/** + * sched_idle_set_state - Record idle state for the current CPU. + * @idle_state: State to record. + */ +void sched_idle_set_state(struct cpuidle_state *idle_state) +{ + idle_set_state(this_rq(), idle_state); +} + static int __read_mostly cpu_idle_force_poll; void cpu_idle_poll_ctrl(bool enable) @@ -48,9 +57,11 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); + stop_critical_timings(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); return 1; @@ -68,6 +79,49 @@ void __weak arch_cpu_idle(void) } /** + * default_idle_call - Default CPU idle routine. + * + * To use when the cpuidle framework cannot be used. + */ +void default_idle_call(void) +{ + if (current_clr_polling_and_test()) { + local_irq_enable(); + } else { + stop_critical_timings(); + arch_cpu_idle(); + start_critical_timings(); + } +} + +static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, + int next_state) +{ + /* Fall back to the default arch idle method on errors. */ + if (next_state < 0) { + default_idle_call(); + return next_state; + } + + /* + * The idle task must be scheduled, it is pointless to go to idle, just + * update no idle residency and return. + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + local_irq_enable(); + return -EBUSY; + } + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + return cpuidle_enter(drv, dev, next_state); +} + +/** * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here @@ -81,7 +135,6 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -93,20 +146,16 @@ static void cpuidle_idle_call(void) } /* - * During the idle period, stop measuring the disabled irqs - * critical sections latencies - */ - stop_critical_timings(); - - /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more * step to the grace period */ rcu_idle_enter(); - if (cpuidle_not_available(drv, dev)) - goto use_default; + if (cpuidle_not_available(drv, dev)) { + default_idle_call(); + goto exit_idle; + } /* * Suspend-to-idle ("freeze") is a system state in which all user space @@ -124,52 +173,19 @@ static void cpuidle_idle_call(void) goto exit_idle; } - reflect = false; next_state = cpuidle_find_deepest_state(drv, dev); + call_cpuidle(drv, dev, next_state); } else { - reflect = true; /* * Ask the cpuidle framework to choose a convenient idle state. */ next_state = cpuidle_select(drv, dev); - } - /* Fall back to the default arch idle method on errors. */ - if (next_state < 0) - goto use_default; - - /* - * The idle task must be scheduled, it is pointless to - * go to idle, just update no idle residency and get - * out of this function - */ - if (current_clr_polling_and_test()) { - dev->last_residency = 0; - entered_state = next_state; - local_irq_enable(); - goto exit_idle; - } - - /* Take note of the planned idle state. */ - idle_set_state(this_rq(), &drv->states[next_state]); - - /* - * Enter the idle state previously returned by the governor decision. - * This function will block until an interrupt occurs and will take - * care of re-enabling the local interrupts - */ - entered_state = cpuidle_enter(drv, dev, next_state); - - /* The cpu is no longer idle or about to enter idle. */ - idle_set_state(this_rq(), NULL); - - if (entered_state == -EBUSY) - goto use_default; - - /* - * Give the governor an opportunity to reflect on the outcome - */ - if (reflect) + entered_state = call_cpuidle(drv, dev, next_state); + /* + * Give the governor an opportunity to reflect on the outcome + */ cpuidle_reflect(dev, entered_state); + } exit_idle: __current_set_polling(); @@ -181,20 +197,6 @@ exit_idle: local_irq_enable(); rcu_idle_exit(); - start_critical_timings(); - return; - -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) - local_irq_enable(); - else - arch_cpu_idle(); - - goto exit_idle; } DEFINE_PER_CPU(bool, cpu_dead_idle); diff --git a/kernel/kernel/sched/idle_task.c b/kernel/kernel/sched/idle_task.c index c65dac8c9..c4ae0f1fd 100644 --- a/kernel/kernel/sched/idle_task.c +++ b/kernel/kernel/sched/idle_task.c @@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/kernel/sched/proc.c b/kernel/kernel/sched/loadavg.c index 8ecd552fe..ef7159012 100644 --- a/kernel/kernel/sched/proc.c +++ b/kernel/kernel/sched/loadavg.c @@ -1,7 +1,9 @@ /* - * kernel/sched/proc.c + * kernel/sched/loadavg.c * - * Kernel load calculations, forked from sched/core.c + * This file contains the magic bits required to compute the global loadavg + * figure. Its a silly number but people think its important. We go through + * great pains to make it work on big machines and tickless kernels. */ #include <linux/export.h> @@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq) long nr_active, delta = 0; nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -186,6 +188,7 @@ void calc_load_enter_idle(void) delta = calc_load_fold_active(this_rq); if (delta) { int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); } } @@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { unsigned long result = 1UL << frac_bits; - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; } return result; @@ -285,7 +290,6 @@ static unsigned long calc_load_n(unsigned long load, unsigned long exp, unsigned long active, unsigned int n) { - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } @@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { } /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. + * + * Called from the global timer code. */ void calc_global_load(unsigned long ticks) { @@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks) } /* - * Called from update_cpu_load() to periodically update this CPU's + * Called from scheduler_tick() to periodically update this CPU's * active count. */ -static void calc_load_account_active(struct rq *this_rq) +void calc_global_load_tick(struct rq *this_rq) { long delta; @@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq) this_rq->calc_load_update += LOAD_FREQ; } - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ */ - -/* - * Called from scheduler_tick() - */ -void update_cpu_load_active(struct rq *this_rq) -{ - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); - - calc_load_account_active(this_rq); -} diff --git a/kernel/kernel/sched/rt.c b/kernel/kernel/sched/rt.c index 637aa208a..8cf360d30 100644 --- a/kernel/kernel/sched/rt.c +++ b/kernel/kernel/sched/rt.c @@ -18,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; int idle = 0; + int overrun; + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - + overrun = hrtimer_forward_now(timer, rt_b->rt_period); if (!overrun) break; + raw_spin_unlock(&rt_b->rt_runtime_lock); idle = do_sched_rt_period_timer(rt_b, overrun); + raw_spin_lock(&rt_b->rt_runtime_lock); } + if (idle) + rt_b->rt_period_active = 0; + raw_spin_unlock(&rt_b->rt_runtime_lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -53,15 +56,16 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + } raw_spin_unlock(&rt_b->rt_runtime_lock); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) static void push_irq_work_func(struct irq_work *work); #endif @@ -258,7 +262,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP -static int pull_rt_task(struct rq *this_rq); +static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { @@ -324,7 +328,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -341,7 +345,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -352,13 +356,23 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } -static inline void set_post_schedule(struct rq *rq) +static DEFINE_PER_CPU(struct callback_head, rt_push_head); +static DEFINE_PER_CPU(struct callback_head, rt_pull_head); + +static void push_rt_tasks(struct rq *); +static void pull_rt_task(struct rq *); + +static inline void queue_push_tasks(struct rq *rq) { - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); + if (!has_pushable_tasks(rq)) + return; + + queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); +} + +static inline void queue_pull_task(struct rq *rq) +{ + queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); } static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -410,12 +424,11 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) return false; } -static inline int pull_rt_task(struct rq *this_rq) +static inline void pull_rt_task(struct rq *this_rq) { - return 0; } -static inline void set_post_schedule(struct rq *rq) +static inline void queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -624,11 +637,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) /* * We ran out of runtime, see if we can borrow some from our neighbours. */ -static int do_balance_runtime(struct rt_rq *rt_rq) +static void do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; - int i, weight, more = 0; + int i, weight; u64 rt_period; weight = cpumask_weight(rd->span); @@ -662,7 +675,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; rt_rq->rt_runtime += diff; - more = 1; if (rt_rq->rt_runtime == rt_period) { raw_spin_unlock(&iter->rt_runtime_lock); break; @@ -672,8 +684,6 @@ next: raw_spin_unlock(&iter->rt_runtime_lock); } raw_spin_unlock(&rt_b->rt_runtime_lock); - - return more; } /* @@ -785,26 +795,19 @@ static void __enable_runtime(struct rq *rq) } } -static int balance_runtime(struct rt_rq *rt_rq) +static void balance_runtime(struct rt_rq *rt_rq) { - int more = 0; - if (!sched_feat(RT_RUNTIME_SHARE)) - return more; + return; if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); + do_balance_runtime(rt_rq); raw_spin_lock(&rt_rq->rt_runtime_lock); } - - return more; } #else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} +static inline void balance_runtime(struct rt_rq *rt_rq) {} #endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) @@ -1261,7 +1264,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1325,7 +1328,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If the current task on @p's runqueue is an RT task, then @@ -1350,7 +1353,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1374,7 +1377,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1382,7 +1385,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 + if (tsk_nr_cpus_allowed(p) != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1467,7 +1470,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct rt_rq *rt_rq = &rq->rt; if (need_pull_rt_task(rq, prev)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we're + * being very careful to re-start the picking loop. + */ + lockdep_unpin_lock(&rq->lock); pull_rt_task(rq); + lockdep_pin_lock(&rq->lock); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1495,7 +1506,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); - set_post_schedule(rq); + queue_push_tasks(rq); return p; } @@ -1508,7 +1519,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1558,7 +1569,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1690,7 +1701,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -1950,14 +1961,15 @@ static void push_irq_work_func(struct irq_work *work) } #endif /* HAVE_RT_PUSH_IPI */ -static int pull_rt_task(struct rq *this_rq) +static void pull_rt_task(struct rq *this_rq) { - int this_cpu = this_rq->cpu, ret = 0, cpu; + int this_cpu = this_rq->cpu, cpu; + bool resched = false; struct task_struct *p; struct rq *src_rq; if (likely(!rt_overloaded(this_rq))) - return 0; + return; /* * Match the barrier from rt_set_overloaded; this guarantees that if we @@ -1968,7 +1980,7 @@ static int pull_rt_task(struct rq *this_rq) #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); - return 0; + return; } #endif @@ -2021,7 +2033,7 @@ static int pull_rt_task(struct rq *this_rq) if (p->prio < src_rq->curr->prio) goto skip; - ret = 1; + resched = true; deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); @@ -2037,12 +2049,8 @@ skip: double_unlock_balance(this_rq, src_rq); } - return ret; -} - -static void post_schedule_rt(struct rq *rq) -{ - push_rt_tasks(rq); + if (resched) + resched_curr(this_rq); } /* @@ -2053,53 +2061,13 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct rq *rq; - int weight; - - BUG_ON(!rt_task(p)); - - if (!task_on_rq_queued(p)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - rq = task_rq(p); - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_task(rq, p); - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_task(rq, p); - rq->rt.rt_nr_migratory++; - } - - update_rt_migration(&rq->rt); -} - /* Assumes rq->lock is held */ static void rq_online_rt(struct rq *rq) { @@ -2138,8 +2106,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; - if (pull_rt_task(rq)) - resched_curr(rq); + queue_pull_task(rq); } void __init init_sched_rt_class(void) @@ -2160,8 +2127,6 @@ void __init init_sched_rt_class(void) */ static void switched_to_rt(struct rq *rq, struct task_struct *p) { - int check_resched = 1; - /* * If we are already running, then there's nothing * that needs to be done. But if we are not running @@ -2171,13 +2136,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && - /* Don't resched if we changed runqueues */ - push_rt_task(rq) && rq != task_rq(p)) - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched && p->prio < rq->curr->prio) + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) + queue_push_tasks(rq); +#else + if (p->prio < rq->curr->prio) resched_curr(rq); +#endif /* CONFIG_SMP */ } } @@ -2198,14 +2162,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * may need to pull tasks to this runqueue. */ if (oldprio < p->prio) - pull_rt_task(rq); + queue_pull_task(rq); + /* * If there's a higher priority task waiting to run - * then reschedule. Note, the above pull_rt_task - * can release the rq lock and p could migrate. - * Only reschedule if p is still on the same runqueue. + * then reschedule. */ - if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) + if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); #else /* For UP simply resched on drop of prio */ @@ -2313,10 +2276,9 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_rt, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif diff --git a/kernel/kernel/sched/sched.h b/kernel/kernel/sched/sched.h index 308f66454..7dd5f53b6 100644 --- a/kernel/kernel/sched/sched.h +++ b/kernel/kernel/sched/sched.h @@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); + +#ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +#else +static inline void update_cpu_load_active(struct rq *this_rq) { } +#endif /* * Helpers for converting nanosecond timing to jiffy resolution @@ -78,6 +84,10 @@ extern void update_cpu_load_active(struct rq *this_rq); */ #define RUNTIME_INF ((u64)~0ULL) +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} static inline int fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; @@ -92,6 +102,11 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} static inline int task_has_rt_policy(struct task_struct *p) { @@ -103,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -static inline bool dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - /* * Tells if entity @a should preempt entity @b. */ @@ -131,6 +141,7 @@ struct rt_bandwidth { ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; + unsigned int rt_period_active; }; void __dl_clear_params(struct task_struct *p); @@ -215,7 +226,7 @@ struct cfs_bandwidth { s64 hierarchical_quota; u64 runtime_expires; - int idle, timer_active; + int idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -238,7 +249,6 @@ struct task_group { #ifdef CONFIG_SMP atomic_long_t load_avg; - atomic_t runnable_avg; #endif #endif @@ -306,7 +316,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); @@ -359,27 +369,20 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * CFS Load tracking - * Under CFS, load is tracked on a per-entity basis and aggregated up. - * This allows for the description of both thread and group usage (in - * the FAIR_GROUP_SCHED case). - * runnable_load_avg is the sum of the load_avg_contrib of the - * sched_entities on the rq. - * blocked_load_avg is similar to runnable_load_avg except that its - * the blocked sched_entities on the rq. - * utilization_load_avg is the sum of the average running time of the - * sched_entities on the rq. + * CFS load tracking */ - unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; - atomic64_t decay_counter; - u64 last_decay; - atomic_long_t removed_load; - + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED - /* Required to track per-cpu representation of a task_group */ - u32 tg_runnable_contrib; - unsigned long tg_load_contrib; + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED /* * h_load = weight * f(tg) * @@ -588,8 +591,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; - - struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -617,9 +618,10 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; + struct callback_head *balance_callback; + unsigned char idle_balance; /* For active balancing */ - int post_schedule; int active_balance; int push_cpu; struct cpu_stop_work active_balance_work; @@ -707,7 +709,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline u64 __rq_clock_broken(struct rq *rq) { - return ACCESS_ONCE(rq->clock); + return READ_ONCE(rq->clock); } static inline u64 rq_clock(struct rq *rq) @@ -760,6 +762,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); #ifdef CONFIG_SMP +static inline void +queue_balance_callback(struct rq *rq, + struct callback_head *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_held(&rq->lock); + + if (unlikely(head->next)) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + extern void sched_ttwu_pending(void); #define rcu_dereference_check_sched_domain(p) \ @@ -990,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) -#else -extern bool numabalancing_enabled; -#endif /* CONFIG_SCHED_DEBUG */ -#else -#define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) -#endif /* CONFIG_NUMA_BALANCING */ +extern struct static_key_false sched_numa_balancing; static inline u64 global_rt_period(void) { @@ -1042,9 +1049,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) #endif @@ -1068,9 +1072,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ @@ -1147,16 +1155,18 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_HEAD 2 +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_HEAD 0x02 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ #else -#define ENQUEUE_WAKING 0 +#define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 8 +#define ENQUEUE_REPLENISH 0x08 +#define ENQUEUE_RESTORE 0x10 -#define DEQUEUE_SLEEP 1 +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1184,9 +1194,8 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + void (*migrate_task_rq)(struct task_struct *p); - void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1218,7 +1227,7 @@ struct sched_class { void (*update_curr) (struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p); #endif }; @@ -1247,6 +1256,8 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + #else static inline void idle_enter_fair(struct rq *rq) { } @@ -1285,7 +1296,6 @@ extern void update_max_interval(void); extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void init_sched_dl_class(void); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -1308,9 +1318,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); - -extern void init_task_runnable_average(struct task_struct *p); +extern void init_entity_runnable_average(struct sched_entity *se); static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -1406,6 +1414,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); @@ -1416,8 +1435,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -1431,8 +1448,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + lockdep_pin_lock(&rq->lock); return rq; + } raw_spin_unlock(&rq->lock); while (unlikely(task_on_rq_migrating(p))) @@ -1469,8 +1488,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag * If we observe the new cpu in task_rq_lock, the acquire will * pair with the WMB to ensure we must then also see migrating. */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + lockdep_pin_lock(&rq->lock); return rq; + } raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); @@ -1482,6 +1503,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag static inline void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); } @@ -1490,6 +1512,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) __releases(p->pi_lock) { + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } @@ -1676,9 +1699,22 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); +extern void +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); + +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); diff --git a/kernel/kernel/sched/stats.h b/kernel/kernel/sched/stats.h index 4ab704339..b0fbc7632 100644 --- a/kernel/kernel/sched/stats.h +++ b/kernel/kernel/sched/stats.h @@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { t->sched_info.last_queued = 0; @@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq, #define sched_info_depart(rq, t) do { } while (0) #define sched_info_arrive(rq, next) do { } while (0) #define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHED_INFO */ /* * The following are functions that support scheduler-internal time accounting. @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) return false; /* @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.utime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->cputime_atomic.utime); } /** @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.stime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->cputime_atomic.stime); } /** @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.sum_exec_runtime += ns; - raw_spin_unlock(&cputimer->lock); + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); } diff --git a/kernel/kernel/sched/stop_task.c b/kernel/kernel/sched/stop_task.c index 79ffec45a..cbc67da10 100644 --- a/kernel/kernel/sched/stop_task.c +++ b/kernel/kernel/sched/stop_task.c @@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_stop, diff --git a/kernel/kernel/sched/swait.c b/kernel/kernel/sched/swait.c new file mode 100644 index 000000000..205fe3686 --- /dev/null +++ b/kernel/kernel/sched/swait.c @@ -0,0 +1,143 @@ +#include <linux/sched.h> +#include <linux/swait.h> +#include <linux/suspend.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up_all_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + int wakes = 0; + + while (!list_empty(&q->task_list)) { + + curr = list_first_entry(&q->task_list, typeof(*curr), + task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); + wakes++; + } + if (pm_in_action) + return; + WARN(wakes > 2, "complate_all() with %d waiters\n", wakes); +} +EXPORT_SYMBOL(swake_up_all_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. + */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); diff --git a/kernel/kernel/sched/wait-simple.c b/kernel/kernel/sched/wait-simple.c deleted file mode 100644 index 7dfa86d1f..000000000 --- a/kernel/kernel/sched/wait-simple.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Simple waitqueues without fancy flags and callbacks - * - * (C) 2011 Thomas Gleixner <tglx@linutronix.de> - * - * Based on kernel/wait.c - * - * For licencing details see kernel-base/COPYING - */ -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/wait-simple.h> - -/* Adds w to head->list. Must be called with head->lock locked. */ -static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) -{ - list_add(&w->node, &head->list); - /* We can't let the condition leak before the setting of head */ - smp_mb(); -} - -/* Removes w from head->list. Must be called with head->lock locked. */ -static inline void __swait_dequeue(struct swaiter *w) -{ - list_del_init(&w->node); -} - -void __init_swait_head(struct swait_head *head, struct lock_class_key *key) -{ - raw_spin_lock_init(&head->lock); - lockdep_set_class(&head->lock, key); - INIT_LIST_HEAD(&head->list); -} -EXPORT_SYMBOL(__init_swait_head); - -void swait_prepare_locked(struct swait_head *head, struct swaiter *w) -{ - w->task = current; - if (list_empty(&w->node)) - __swait_enqueue(head, w); -} - -void swait_prepare(struct swait_head *head, struct swaiter *w, int state) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&head->lock, flags); - swait_prepare_locked(head, w); - __set_current_state(state); - raw_spin_unlock_irqrestore(&head->lock, flags); -} -EXPORT_SYMBOL(swait_prepare); - -void swait_finish_locked(struct swait_head *head, struct swaiter *w) -{ - __set_current_state(TASK_RUNNING); - if (w->task) - __swait_dequeue(w); -} - -void swait_finish(struct swait_head *head, struct swaiter *w) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - if (w->task) { - raw_spin_lock_irqsave(&head->lock, flags); - __swait_dequeue(w); - raw_spin_unlock_irqrestore(&head->lock, flags); - } -} -EXPORT_SYMBOL(swait_finish); - -unsigned int -__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num) -{ - struct swaiter *curr, *next; - int woken = 0; - - list_for_each_entry_safe(curr, next, &head->list, node) { - if (wake_up_state(curr->task, state)) { - __swait_dequeue(curr); - /* - * The waiting task can free the waiter as - * soon as curr->task = NULL is written, - * without taking any locks. A memory barrier - * is required here to prevent the following - * store to curr->task from getting ahead of - * the dequeue operation. - */ - smp_wmb(); - curr->task = NULL; - if (++woken == num) - break; - } - } - return woken; -} - -unsigned int -__swait_wake(struct swait_head *head, unsigned int state, unsigned int num) -{ - unsigned long flags; - int woken; - - if (!swaitqueue_active(head)) - return 0; - - raw_spin_lock_irqsave(&head->lock, flags); - woken = __swait_wake_locked(head, state, num); - raw_spin_unlock_irqrestore(&head->lock, flags); - return woken; -} -EXPORT_SYMBOL(__swait_wake); diff --git a/kernel/kernel/sched/wait.c b/kernel/kernel/sched/wait.c index 852143a79..f15d6b6a5 100644 --- a/kernel/kernel/sched/wait.c +++ b/kernel/kernel/sched/wait.c @@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } @@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) * doesn't imply write barrier and the users expects write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with set_mb() in wait_woken(). + * and is paired with smp_store_mb() in wait_woken(). */ smp_wmb(); /* C */ wait->flags |= WQ_FLAG_WOKEN; @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,44 +581,44 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { - unsigned long now = ACCESS_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; + unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { - unsigned long now = ACCESS_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; + unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/kernel/sched/work-simple.c b/kernel/kernel/sched/work-simple.c index e57a05225..9ffe40543 100644 --- a/kernel/kernel/sched/work-simple.c +++ b/kernel/kernel/sched/work-simple.c @@ -5,7 +5,7 @@ * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context. */ -#include <linux/wait-simple.h> +#include <linux/swait.h> #include <linux/work-simple.h> #include <linux/kthread.h> #include <linux/slab.h> @@ -19,7 +19,7 @@ static struct sworker *glob_worker; struct sworker { struct list_head events; - struct swait_head wq; + struct swait_queue_head wq; raw_spinlock_t lock; @@ -80,7 +80,7 @@ static struct sworker *swork_create(void) INIT_LIST_HEAD(&worker->events); raw_spin_lock_init(&worker->lock); - init_swait_head(&worker->wq); + init_swait_queue_head(&worker->wq); worker->task = kthread_run(swork_kthread, worker, "kswork"); if (IS_ERR(worker->task)) { @@ -117,7 +117,7 @@ bool swork_queue(struct swork_event *sev) list_add_tail(&sev->item, &glob_worker->events); raw_spin_unlock_irqrestore(&glob_worker->lock, flags); - swait_wake(&glob_worker->wq); + swake_up(&glob_worker->wq); return true; } EXPORT_SYMBOL_GPL(swork_queue); diff --git a/kernel/kernel/seccomp.c b/kernel/kernel/seccomp.c index 4f4402894..15a1795bb 100644 --- a/kernel/kernel/seccomp.c +++ b/kernel/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. */ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; @@ -317,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } @@ -346,16 +345,14 @@ static inline void seccomp_sync_threads(void) */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { - struct seccomp_filter *filter; - unsigned long fp_size; - struct sock_filter *fp; - int new_len; - long ret; + struct seccomp_filter *sfilter; + int ret; + const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - fp_size = fprog->len * sizeof(struct sock_filter); /* * Installing a seccomp filter requires that the task has @@ -368,60 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return ERR_PTR(-EACCES); - fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); - if (!fp) - return ERR_PTR(-ENOMEM); - - /* Copy the instructions from fprog. */ - ret = -EFAULT; - if (copy_from_user(fp, fprog->filter, fp_size)) - goto free_prog; - - /* Check and rewrite the fprog via the skb checker */ - ret = bpf_check_classic(fp, fprog->len); - if (ret) - goto free_prog; - - /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(fp, fprog->len); - if (ret) - goto free_prog; - - /* Convert 'sock_filter' insns to 'bpf_insn' insns */ - ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); - if (ret) - goto free_prog; - /* Allocate a new seccomp_filter */ - ret = -ENOMEM; - filter = kzalloc(sizeof(struct seccomp_filter), - GFP_KERNEL|__GFP_NOWARN); - if (!filter) - goto free_prog; - - filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); - if (!filter->prog) - goto free_filter; - - ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); - if (ret) - goto free_filter_prog; - - kfree(fp); - atomic_set(&filter->usage, 1); - filter->prog->len = new_len; + sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); + if (!sfilter) + return ERR_PTR(-ENOMEM); - bpf_prog_select_runtime(filter->prog); + ret = bpf_prog_create_from_user(&sfilter->prog, fprog, + seccomp_check_filter, save_orig); + if (ret < 0) { + kfree(sfilter); + return ERR_PTR(ret); + } - return filter; + atomic_set(&sfilter->usage, 1); -free_filter_prog: - __bpf_prog_free(filter->prog); -free_filter: - kfree(filter); -free_prog: - kfree(fp); - return ERR_PTR(ret); + return sfilter; } /** @@ -512,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk) static inline void seccomp_filter_free(struct seccomp_filter *filter) { if (filter) { - bpf_prog_free(filter->prog); + bpf_prog_destroy(filter->prog); kfree(filter); } } @@ -591,7 +549,11 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (mode == 0) + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); @@ -692,6 +654,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ @@ -902,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) /* prctl interface doesn't have flags, so they are always zero. */ return do_seccomp(op, 0, uargs); } + +#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) +long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, + void __user *data) +{ + struct seccomp_filter *filter; + struct sock_fprog_kern *fprog; + long ret; + unsigned long count = 0; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + spin_lock_irq(&task->sighand->siglock); + if (task->seccomp.mode != SECCOMP_MODE_FILTER) { + ret = -EINVAL; + goto out; + } + + filter = task->seccomp.filter; + while (filter) { + filter = filter->prev; + count++; + } + + if (filter_off >= count) { + ret = -ENOENT; + goto out; + } + count -= filter_off; + + filter = task->seccomp.filter; + while (filter && count > 1) { + filter = filter->prev; + count--; + } + + if (WARN_ON(count != 1 || !filter)) { + /* The filter tree shouldn't shrink while we're using it. */ + ret = -ENOENT; + goto out; + } + + fprog = filter->prog->orig_prog; + if (!fprog) { + /* This must be a new non-cBPF filter, since we save every + * every cBPF filter's orig_prog above when + * CONFIG_CHECKPOINT_RESTORE is enabled. + */ + ret = -EMEDIUMTYPE; + goto out; + } + + ret = fprog->len; + if (!data) + goto out; + + get_seccomp_filter(task); + spin_unlock_irq(&task->sighand->siglock); + + if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) + ret = -EFAULT; + + put_seccomp_filter(task); + return ret; + +out: + spin_unlock_irq(&task->sighand->siglock); + return ret; +} +#endif diff --git a/kernel/kernel/signal.c b/kernel/kernel/signal.c index 1336e4c01..bc2c990f3 100644 --- a/kernel/kernel/signal.c +++ b/kernel/kernel/signal.c @@ -246,7 +246,7 @@ static inline void print_dropped_signal(int sig) * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. */ -bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) +bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); @@ -298,7 +298,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) * CONTEXT: * Must be called with @task->sighand->siglock held. */ -void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) +void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); @@ -353,7 +353,6 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } -#ifdef __HAVE_ARCH_CMPXCHG static inline struct sigqueue *get_task_cache(struct task_struct *t) { struct sigqueue *q = t->sigqueue_cache; @@ -370,20 +369,6 @@ static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) return 1; } -#else - -static inline struct sigqueue *get_task_cache(struct task_struct *t) -{ - return NULL; -} - -static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) -{ - return 1; -} - -#endif - /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an @@ -487,21 +472,16 @@ void flush_task_sigqueue(struct task_struct *tsk) } /* - * Flush all pending signals for a task. + * Flush all pending signals for this kthread. */ -void __flush_signals(struct task_struct *t) -{ - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); -} - void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - __flush_signals(t); + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } @@ -581,41 +561,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -658,19 +603,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -914,7 +848,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. @@ -2101,7 +2035,7 @@ static bool do_signal_stop(int signr) struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { - unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ @@ -2589,9 +2523,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points. @@ -3658,7 +3589,7 @@ SYSCALL_DEFINE0(pause) #endif -int sigsuspend(sigset_t *set) +static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); diff --git a/kernel/kernel/smp.c b/kernel/kernel/smp.c index 07854477c..d903c0222 100644 --- a/kernel/kernel/smp.c +++ b/kernel/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); diff --git a/kernel/kernel/smpboot.c b/kernel/kernel/smpboot.c index c697f73d8..d264f59bf 100644 --- a/kernel/kernel/smpboot.c +++ b/kernel/kernel/smpboot.c @@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); preempt_enable(); - if (ht->cleanup) + /* cleanup must mirror setup */ + if (ht->cleanup && td->status != HP_THREAD_NONE) ht->cleanup(td->cpu, cpu_online(td->cpu)); kfree(td); return 0; @@ -221,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (ht->pre_unpark) - ht->pre_unpark(cpu); - kthread_unpark(tsk); + if (!ht->selfparking) + kthread_unpark(tsk); } void smpboot_unpark_threads(unsigned int cpu) @@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - smpboot_unpark_thread(cur, cpu); + if (cpumask_test_cpu(cpu, cur->cpumask)) + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); } @@ -271,25 +272,34 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * to hotplug * @plug_thread: Hotplug thread descriptor + * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask) { unsigned int cpu; int ret = 0; + if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_copy(plug_thread->cpumask, cpumask); + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); + free_cpumask_var(plug_thread->cpumask); goto out; } - smpboot_unpark_thread(plug_thread, cpu); + if (cpumask_test_cpu(cpu, cpumask)) + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -297,7 +307,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug @@ -313,9 +323,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); + free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); +/** + * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked + * @plug_thread: Hotplug thread descriptor + * @new: Revised mask to use + * + * The cpumask field in the smp_hotplug_thread must not be updated directly + * by the client, but only by calling this function. + * This function can only be called on a registered smp_hotplug_thread. + */ +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) +{ + struct cpumask *old = plug_thread->cpumask; + cpumask_var_t tmp; + unsigned int cpu; + + if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) + return -ENOMEM; + + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + + /* Park threads that were exclusively enabled on the old mask. */ + cpumask_andnot(tmp, old, new); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_park_thread(plug_thread, cpu); + + /* Unpark threads that are exclusively enabled on the new mask. */ + cpumask_andnot(tmp, new, old); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_unpark_thread(plug_thread, cpu); + + cpumask_copy(old, new); + + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); + + free_cpumask_var(tmp); + + return 0; +} +EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); + static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* diff --git a/kernel/kernel/softirq.c b/kernel/kernel/softirq.c index 49baf8184..cb9c1d5de 100644 --- a/kernel/kernel/softirq.c +++ b/kernel/kernel/softirq.c @@ -58,6 +58,10 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +#ifdef CONFIG_PREEMPT_RT_FULL +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ)) +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd); +#endif const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -171,6 +175,17 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +#ifdef CONFIG_PREEMPT_RT_FULL +static void wakeup_timer_softirqd(void) +{ + /* Interrupts are disabled: no need to stop preemption */ + struct task_struct *tsk = __this_cpu_read(ktimer_softirqd); + + if (tsk && tsk->state != TASK_RUNNING) + wake_up_process(tsk); +} +#endif + static void handle_softirq(unsigned int vec_nr) { struct softirq_action *h = softirq_vec + vec_nr; @@ -272,9 +287,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -473,7 +488,6 @@ void __raise_softirq_irqoff(unsigned int nr) static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } static void ksoftirqd_set_sched_params(unsigned int cpu) { } -static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ @@ -549,31 +563,21 @@ static void do_current_softirqs(void) do_single_softirq(i); } softirq_clr_runner(i); - unlock_softirq(i); WARN_ON(current->softirq_nestcnt != 1); + local_irq_enable(); + unlock_softirq(i); + local_irq_disable(); } } -static void __local_bh_disable(void) +void __local_bh_disable(void) { if (++current->softirq_nestcnt == 1) migrate_disable(); } +EXPORT_SYMBOL(__local_bh_disable); -void local_bh_disable(void) -{ - __local_bh_disable(); -} -EXPORT_SYMBOL(local_bh_disable); - -void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) -{ - __local_bh_disable(); - if (cnt & PREEMPT_CHECK_OFFSET) - preempt_disable(); -} - -static void __local_bh_enable(void) +void __local_bh_enable(void) { if (WARN_ON(current->softirq_nestcnt == 0)) return; @@ -586,25 +590,7 @@ static void __local_bh_enable(void) if (--current->softirq_nestcnt == 0) migrate_enable(); } - -void local_bh_enable(void) -{ - __local_bh_enable(); -} -EXPORT_SYMBOL(local_bh_enable); - -extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) -{ - __local_bh_enable(); - if (cnt & PREEMPT_CHECK_OFFSET) - preempt_enable(); -} - -void local_bh_enable_ip(unsigned long ip) -{ - local_bh_enable(); -} -EXPORT_SYMBOL(local_bh_enable_ip); +EXPORT_SYMBOL(__local_bh_enable); void _local_bh_enable(void) { @@ -629,8 +615,8 @@ static void run_ksoftirqd(unsigned int cpu) do_current_softirqs(); current->softirq_nestcnt--; - rcu_note_context_switch(); local_irq_enable(); + cond_resched_rcu_qs(); } /* @@ -648,8 +634,12 @@ void thread_do_softirq(void) static void do_raise_softirq_irqoff(unsigned int nr) { + unsigned int mask; + + mask = 1UL << nr; + trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); + or_softirq_pending(mask); /* * If we are not in a hard interrupt and inside a bh disabled @@ -658,16 +648,51 @@ static void do_raise_softirq_irqoff(unsigned int nr) * delegate it to ksoftirqd. */ if (!in_irq() && current->softirq_nestcnt) - current->softirqs_raised |= (1U << nr); - else if (__this_cpu_read(ksoftirqd)) - __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); + current->softirqs_raised |= mask; + else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd)) + return; + + if (mask & TIMER_SOFTIRQS) + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask; + else + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask; +} + +static void wakeup_proper_softirq(unsigned int nr) +{ + if ((1UL << nr) & TIMER_SOFTIRQS) + wakeup_timer_softirqd(); + else + wakeup_softirqd(); } + void __raise_softirq_irqoff(unsigned int nr) { do_raise_softirq_irqoff(nr); if (!in_irq() && !current->softirq_nestcnt) - wakeup_softirqd(); + wakeup_proper_softirq(nr); +} + +/* + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd + */ +void __raise_softirq_irqoff_ksoft(unsigned int nr) +{ + unsigned int mask; + + if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) || + !__this_cpu_read(ktimer_softirqd))) + return; + mask = 1UL << nr; + + trace_softirq_raise(nr); + or_softirq_pending(mask); + if (mask & TIMER_SOFTIRQS) + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask; + else + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask; + wakeup_proper_softirq(nr); } /* @@ -693,7 +718,7 @@ void raise_softirq_irqoff(unsigned int nr) * raise a WARN() if the condition is met. */ if (!current->softirq_nestcnt) - wakeup_softirqd(); + wakeup_proper_softirq(nr); } static inline int ksoftirqd_softirq_pending(void) @@ -706,22 +731,37 @@ static inline void _local_bh_enable_nort(void) { } static inline void ksoftirqd_set_sched_params(unsigned int cpu) { + /* Take over all but timer pending softirqs when starting */ + local_irq_disable(); + current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS; + local_irq_enable(); +} + +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu) +{ struct sched_param param = { .sched_priority = 1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); - /* Take over all pending softirqs when starting */ + + /* Take over timer pending softirqs when starting */ local_irq_disable(); - current->softirqs_raised = local_softirq_pending(); + current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS; local_irq_enable(); } -static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu, + bool online) { struct sched_param param = { .sched_priority = 0 }; sched_setscheduler(current, SCHED_NORMAL, ¶m); } +static int ktimer_softirqd_should_run(unsigned int cpu) +{ + return current->softirqs_raised; +} + #endif /* PREEMPT_RT_FULL */ /* * Enter an interrupt context. @@ -771,6 +811,9 @@ static inline void invoke_softirq(void) if (__this_cpu_read(ksoftirqd) && __this_cpu_read(ksoftirqd)->softirqs_raised) wakeup_softirqd(); + if (__this_cpu_read(ktimer_softirqd) && + __this_cpu_read(ktimer_softirqd)->softirqs_raised) + wakeup_timer_softirqd(); local_irq_restore(flags); #endif } @@ -1203,17 +1246,30 @@ static struct notifier_block cpu_nfb = { static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, .setup = ksoftirqd_set_sched_params, - .cleanup = ksoftirqd_clr_sched_params, .thread_should_run = ksoftirqd_should_run, .thread_fn = run_ksoftirqd, .thread_comm = "ksoftirqd/%u", }; +#ifdef CONFIG_PREEMPT_RT_FULL +static struct smp_hotplug_thread softirq_timer_threads = { + .store = &ktimer_softirqd, + .setup = ktimer_softirqd_set_sched_params, + .cleanup = ktimer_softirqd_clr_sched_params, + .thread_should_run = ktimer_softirqd_should_run, + .thread_fn = run_ksoftirqd, + .thread_comm = "ktimersoftd/%u", +}; +#endif + static __init int spawn_ksoftirqd(void) { register_cpu_notifier(&cpu_nfb); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); +#ifdef CONFIG_PREEMPT_RT_FULL + BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads)); +#endif return 0; } diff --git a/kernel/kernel/stop_machine.c b/kernel/kernel/stop_machine.c index 1af29ad20..f84d3b45c 100644 --- a/kernel/kernel/stop_machine.c +++ b/kernel/kernel/stop_machine.c @@ -30,18 +30,21 @@ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ bool executed; /* actually executed? */ int ret; /* collected return value */ - struct task_struct *waiter; /* woken when nr_todo reaches 0 */ + struct completion completion; /* fired if nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { + struct task_struct *thread; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ + + struct cpu_stop_work stop_work; /* for stop_cpus */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); -static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; /* @@ -56,7 +59,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); - done->waiter = current; + init_completion(&done->completion); } /* signal completion unless @done is NULL */ @@ -65,48 +68,32 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) if (done) { if (executed) done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) { - wake_up_process(done->waiter); - done->waiter = NULL; - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } } +static void __cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); +} + /* queue @work to @stopper. if offline, @work is completed immediately */ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p = per_cpu(cpu_stopper_task, cpu); - unsigned long flags; raw_spin_lock_irqsave(&stopper->lock, flags); - - if (stopper->enabled) { - list_add_tail(&work->list, &stopper->works); - wake_up_process(p); - } else + if (stopper->enabled) + __cpu_stop_queue_work(stopper, work); + else cpu_stop_signal_done(work->done, false); - raw_spin_unlock_irqrestore(&stopper->lock, flags); } -static void wait_for_stop_done(struct cpu_stop_done *done) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - while (atomic_read(&done->nr_todo)) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - /* - * We need to wait until cpu_stop_signal_done() has cleared - * done->waiter. - */ - while (done->waiter) - cpu_relax(); - set_current_state(TASK_RUNNING); -} - /** * stop_one_cpu - stop a cpu * @cpu: cpu to stop @@ -138,7 +125,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); cpu_stop_queue_work(cpu, &work); - wait_for_stop_done(&done); + wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -157,7 +144,7 @@ enum multi_stop_state { }; struct multi_stop_data { - int (*fn)(void *); + cpu_stop_fn_t fn; void *data; /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ unsigned int num_threads; @@ -229,25 +216,31 @@ static int multi_cpu_stop(void *data) return err; } -struct irq_cpu_stop_queue_work_info { - int cpu1; - int cpu2; - struct cpu_stop_work *work1; - struct cpu_stop_work *work2; -}; - -/* - * This function is always run with irqs and preemption disabled. - * This guarantees that both work1 and work2 get queued, before - * our local migrate thread gets the chance to preempt us. - */ -static void irq_cpu_stop_queue_work(void *arg) +static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, + int cpu2, struct cpu_stop_work *work2) { - struct irq_cpu_stop_queue_work_info *info = arg; - cpu_stop_queue_work(info->cpu1, info->work1); - cpu_stop_queue_work(info->cpu2, info->work2); -} + struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); + struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + int err; + + lg_double_lock(&stop_cpus_lock, cpu1, cpu2); + raw_spin_lock_irq(&stopper1->lock); + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + + err = -ENOENT; + if (!stopper1->enabled || !stopper2->enabled) + goto unlock; + + err = 0; + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); +unlock: + raw_spin_unlock(&stopper2->lock); + raw_spin_unlock_irq(&stopper1->lock); + lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + return err; +} /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop @@ -263,7 +256,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * { struct cpu_stop_done done; struct cpu_stop_work work1, work2; - struct irq_cpu_stop_queue_work_info call_args; struct multi_stop_data msdata; preempt_disable_nort(); @@ -280,42 +272,19 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * .done = &done }; - call_args = (struct irq_cpu_stop_queue_work_info){ - .cpu1 = cpu1, - .cpu2 = cpu2, - .work1 = &work1, - .work2 = &work2, - }; - cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); - /* - * If we observe both CPUs active we know _cpu_down() cannot yet have - * queued its stop_machine works and therefore ours will get executed - * first. Or its not either one of our CPUs that's getting unplugged, - * in which case we don't care. - * - * This relies on the stopper workqueues to be FIFO. - */ - if (!cpu_active(cpu1) || !cpu_active(cpu2)) { + if (cpu1 > cpu2) + swap(cpu1, cpu2); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { preempt_enable_nort(); return -ENOENT; } - lg_local_lock(&stop_cpus_lock); - /* - * Queuing needs to be done by the lowest numbered CPU, to ensure - * that works are always queued in the same order on every CPU. - * This prevents deadlocks. - */ - smp_call_function_single(min(cpu1, cpu2), - &irq_cpu_stop_queue_work, - &call_args, 1); - lg_local_unlock(&stop_cpus_lock); preempt_enable_nort(); - wait_for_stop_done(&done); + wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -343,7 +312,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, @@ -352,14 +320,6 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, struct cpu_stop_work *work; unsigned int cpu; - /* initialize works and done */ - for_each_cpu(cpu, cpumask) { - work = &per_cpu(stop_cpus_work, cpu); - work->fn = fn; - work->arg = arg; - work->done = done; - } - /* * Make sure that all work is queued on all cpus before * any of the cpus can execute it. @@ -368,8 +328,14 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, lg_global_lock(&stop_cpus_lock); else lg_global_trylock_relax(&stop_cpus_lock); - for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); + + for_each_cpu(cpu, cpumask) { + work = &per_cpu(cpu_stopper.stop_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = done; + cpu_stop_queue_work(cpu, work); + } lg_global_unlock(&stop_cpus_lock); } @@ -380,7 +346,7 @@ static int __stop_cpus(const struct cpumask *cpumask, cpu_stop_init_done(&done, cpumask_weight(cpumask)); queue_stop_cpus_work(cpumask, fn, arg, &done, false); - wait_for_stop_done(&done); + wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -511,56 +477,52 @@ repeat: kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, ksym_buf), arg); - /* - * Make sure that the wakeup and setting done->waiter - * to NULL is atomic. - */ - local_irq_disable(); cpu_stop_signal_done(done, true); - local_irq_enable(); goto repeat; } } +void stop_machine_park(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + /* + * Lockless. cpu_stopper_thread() will take stopper->lock and flush + * the pending works before it parks, until then it is fine to queue + * the new works. + */ + stopper->enabled = false; + kthread_park(stopper->thread); +} + extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) { - sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); + sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); } static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work; - unsigned long flags; - /* drain remaining works */ - raw_spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry(work, &stopper->works, list) - cpu_stop_signal_done(work->done, false); - stopper->enabled = false; - raw_spin_unlock_irqrestore(&stopper->lock, flags); + WARN_ON(!list_empty(&stopper->works)); } -static void cpu_stop_unpark(unsigned int cpu) +void stop_machine_unpark(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - raw_spin_lock_irq(&stopper->lock); stopper->enabled = true; - raw_spin_unlock_irq(&stopper->lock); + kthread_unpark(stopper->thread); } static struct smp_hotplug_thread cpu_stop_threads = { - .store = &cpu_stopper_task, + .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", .create = cpu_stop_create, - .setup = cpu_stop_unpark, .park = cpu_stop_park, - .pre_unpark = cpu_stop_unpark, .selfparking = true, }; @@ -578,14 +540,15 @@ static int __init cpu_stop_init(void) lg_lock_init(&stop_cpus_lock, "stop_cpus_lock"); BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); + stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) -int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -618,7 +581,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } -int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { int ret; @@ -652,7 +615,7 @@ EXPORT_SYMBOL_GPL(stop_machine); * 0 if all executions of @fn returned 0, any non zero return value if any * returned non zero. */ -int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, +int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, @@ -676,11 +639,11 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, ret = multi_cpu_stop(&msdata); /* Busy wait for completion. */ - while (atomic_read(&done.nr_todo)) + while (!completion_done(&done.completion)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kernel/sys.c b/kernel/kernel/sys.c index a4e372b79..78947de6f 100644 --- a/kernel/kernel/sys.c +++ b/kernel/kernel/sys.c @@ -92,10 +92,10 @@ # define SET_TSC_CTL(a) (-EINVAL) #endif #ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) +# define MPX_ENABLE_MANAGEMENT() (-EINVAL) #endif #ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) +# define MPX_DISABLE_MANAGEMENT() (-EINVAL) #endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; @@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. */ err = -EACCES; - if (!S_ISREG(inode->i_mode) || - exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) goto exit; err = inode_permission(inode, MAY_EXEC); @@ -1722,7 +1721,6 @@ exit_err: goto exit; } -#ifdef CONFIG_CHECKPOINT_RESTORE /* * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. @@ -1818,6 +1816,7 @@ out: return error; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; @@ -1854,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - if (prctl_map.exe_fd != (u32)-1) + if (prctl_map.exe_fd != (u32)-1) { error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); - down_read(&mm->mmap_sem); - if (error) - goto out; + if (error) + return error; + } + + down_write(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1895,17 +1896,46 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - error = 0; -out: - up_read(&mm->mmap_sem); - return error; + up_write(&mm->mmap_sem); + return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + /* + * This doesn't move the auxiliary vector itself since it's pinned to + * mm_struct, but it permits filling the vector with new values. It's + * up to the caller to provide sane values here, otherwise userspace + * tools which use this vector might be unhappy. + */ + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (len > sizeof(user_auxv)) + return -EINVAL; + + if (copy_from_user(user_auxv, (const void __user *)addr, len)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, len); + task_unlock(current); + + return 0; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; + struct prctl_mm_map prctl_map; struct vm_area_struct *vma; int error; @@ -1925,50 +1955,75 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_AUXV) + return prctl_set_auxv(mm, addr, arg4); + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); + prctl_map.start_code = mm->start_code; + prctl_map.end_code = mm->end_code; + prctl_map.start_data = mm->start_data; + prctl_map.end_data = mm->end_data; + prctl_map.start_brk = mm->start_brk; + prctl_map.brk = mm->brk; + prctl_map.start_stack = mm->start_stack; + prctl_map.arg_start = mm->arg_start; + prctl_map.arg_end = mm->arg_end; + prctl_map.env_start = mm->env_start; + prctl_map.env_end = mm->env_end; + prctl_map.auxv = NULL; + prctl_map.auxv_size = 0; + prctl_map.exe_fd = -1; + switch (opt) { case PR_SET_MM_START_CODE: - mm->start_code = addr; + prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: - mm->end_code = addr; + prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: - mm->start_data = addr; + prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: - mm->end_data = addr; + prctl_map.end_data = addr; + break; + case PR_SET_MM_START_STACK: + prctl_map.start_stack = addr; break; - case PR_SET_MM_START_BRK: - if (addr <= mm->end_data) - goto out; - - if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, - mm->end_data, mm->start_data)) - goto out; - - mm->start_brk = addr; + prctl_map.start_brk = addr; break; - case PR_SET_MM_BRK: - if (addr <= mm->end_data) - goto out; - - if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, - mm->end_data, mm->start_data)) - goto out; - - mm->brk = addr; + prctl_map.brk = addr; break; + case PR_SET_MM_ARG_START: + prctl_map.arg_start = addr; + break; + case PR_SET_MM_ARG_END: + prctl_map.arg_end = addr; + break; + case PR_SET_MM_ENV_START: + prctl_map.env_start = addr; + break; + case PR_SET_MM_ENV_END: + prctl_map.env_end = addr; + break; + default: + goto out; + } + + error = validate_prctl_map(&prctl_map); + if (error) + goto out; + switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can @@ -1985,55 +2040,23 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } - if (opt == PR_SET_MM_START_STACK) - mm->start_stack = addr; - else if (opt == PR_SET_MM_ARG_START) - mm->arg_start = addr; - else if (opt == PR_SET_MM_ARG_END) - mm->arg_end = addr; - else if (opt == PR_SET_MM_ENV_START) - mm->env_start = addr; - else if (opt == PR_SET_MM_ENV_END) - mm->env_end = addr; - break; - - /* - * This doesn't move auxiliary vector itself - * since it's pinned to mm_struct, but allow - * to fill vector with new values. It's up - * to a caller to provide sane values here - * otherwise user space tools which use this - * vector might be unhappy. - */ - case PR_SET_MM_AUXV: { - unsigned long user_auxv[AT_VECTOR_SIZE]; - - if (arg4 > sizeof(user_auxv)) - goto out; - up_read(&mm->mmap_sem); - - if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) - return -EFAULT; - - /* Make sure the last entry is always AT_NULL */ - user_auxv[AT_VECTOR_SIZE - 2] = 0; - user_auxv[AT_VECTOR_SIZE - 1] = 0; - - BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); - - task_lock(current); - memcpy(mm->saved_auxv, user_auxv, arg4); - task_unlock(current); - - return 0; - } - default: - goto out; } + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } @@ -2230,12 +2253,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_MPX_ENABLE_MANAGEMENT: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = MPX_ENABLE_MANAGEMENT(me); + error = MPX_ENABLE_MANAGEMENT(); break; case PR_MPX_DISABLE_MANAGEMENT: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = MPX_DISABLE_MANAGEMENT(me); + error = MPX_DISABLE_MANAGEMENT(); break; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); diff --git a/kernel/kernel/sys_ni.c b/kernel/kernel/sys_ni.c index 7995ef586..0623787ec 100644 --- a/kernel/kernel/sys_ni.c +++ b/kernel/kernel/sys_ni.c @@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask); cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_modify_ldt); cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); @@ -193,6 +194,7 @@ cond_syscall(sys_mlock); cond_syscall(sys_munlock); cond_syscall(sys_mlockall); cond_syscall(sys_munlockall); +cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); @@ -218,6 +220,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); @@ -243,3 +246,6 @@ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); + +/* membarrier */ +cond_syscall(sys_membarrier); diff --git a/kernel/kernel/sysctl.c b/kernel/kernel/sysctl.c index c3eee4c6d..dc6858d66 100644 --- a/kernel/kernel/sysctl.c +++ b/kernel/kernel/sysctl.c @@ -64,6 +64,7 @@ #include <linux/binfmts.h> #include <linux/sched/sysctl.h> #include <linux/kexec.h> +#include <linux/bpf.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -349,15 +350,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -630,7 +622,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, @@ -881,6 +873,13 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, + { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), @@ -889,6 +888,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", @@ -899,6 +909,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_SMP */ #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) @@ -1132,6 +1151,27 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif +#ifdef CONFIG_BPF_SYSCALL + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif { } }; @@ -1988,7 +2028,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int val = *valp; if (val < 0) { *negp = true; - *lvalp = (unsigned long)-val; + *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; @@ -2194,7 +2234,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int val = *valp; if (val < 0) { *negp = true; - *lvalp = (unsigned long)-val; + *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; @@ -2429,7 +2469,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; @@ -2452,7 +2492,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; @@ -2477,7 +2517,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; diff --git a/kernel/kernel/system_certificates.S b/kernel/kernel/system_certificates.S deleted file mode 100644 index 3e9868d47..000000000 --- a/kernel/kernel/system_certificates.S +++ /dev/null @@ -1,20 +0,0 @@ -#include <linux/export.h> -#include <linux/init.h> - - __INITRODATA - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list) -VMLINUX_SYMBOL(system_certificate_list): -__cert_list_start: - .incbin "kernel/x509_certificate_list" -__cert_list_end: - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list_size) -VMLINUX_SYMBOL(system_certificate_list_size): -#ifdef CONFIG_64BIT - .quad __cert_list_end - __cert_list_start -#else - .long __cert_list_end - __cert_list_start -#endif diff --git a/kernel/kernel/system_keyring.c b/kernel/kernel/system_keyring.c deleted file mode 100644 index 875f64e89..000000000 --- a/kernel/kernel/system_keyring.c +++ /dev/null @@ -1,106 +0,0 @@ -/* System trusted keyring for trusted public keys - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include <keys/system_keyring.h> -#include "module-internal.h" - -struct key *system_trusted_keyring; -EXPORT_SYMBOL_GPL(system_trusted_keyring); - -extern __initconst const u8 system_certificate_list[]; -extern __initconst const unsigned long system_certificate_list_size; - -/* - * Load the compiled-in keys - */ -static __init int system_trusted_keyring_init(void) -{ - pr_notice("Initialise system trusted keyring\n"); - - system_trusted_keyring = - keyring_alloc(".system_keyring", - KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(system_trusted_keyring)) - panic("Can't allocate system trusted keyring\n"); - - set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(system_trusted_keyring_init); - -/* - * Load the compiled-in list of X.509 certificates. - */ -static __init int load_system_certificate_list(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading compiled-in X.509 certificates\n"); - - p = system_certificate_list; - end = p + system_certificate_list_size; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), - "asymmetric", - NULL, - p, - plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_TRUSTED); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_system_certificate_list); diff --git a/kernel/kernel/task_work.c b/kernel/kernel/task_work.c index 8727032e3..53fa971d0 100644 --- a/kernel/kernel/task_work.c +++ b/kernel/kernel/task_work.c @@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * This is like the signal handler which runs in kernel mode, but it doesn't * try to wake up the @task. * + * Note: there is no ordering guarantee on works queued here. + * * RETURNS: * 0 if succeeds or -ESRCH. */ @@ -108,16 +110,6 @@ void task_work_run(void) raw_spin_unlock_wait(&task->pi_lock); smp_mb(); - /* Reverse the list to run the works in fifo order */ - head = NULL; - do { - next = work->next; - work->next = head; - head = work; - work = next; - } while (work); - - work = head; do { next = work->next; work->func(work); diff --git a/kernel/kernel/time/Kconfig b/kernel/kernel/time/Kconfig index 579ce1b92..4008d9f95 100644 --- a/kernel/kernel/time/Kconfig +++ b/kernel/kernel/time/Kconfig @@ -92,12 +92,10 @@ config NO_HZ_FULL depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP - # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON - select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK diff --git a/kernel/kernel/time/Makefile b/kernel/kernel/time/Makefile index 01f031241..49eca0bee 100644 --- a/kernel/kernel/time/Makefile +++ b/kernel/kernel/time/Makefile @@ -12,20 +12,3 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o - -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - diff --git a/kernel/kernel/time/alarmtimer.c b/kernel/kernel/time/alarmtimer.c index 1b001ed1e..7fbba635a 100644 --- a/kernel/kernel/time/alarmtimer.c +++ b/kernel/kernel/time/alarmtimer.c @@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init); * @alarm: ptr to alarm to set * @start: time to run the alarm */ -int alarm_start(struct alarm *alarm, ktime_t start) +void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret; spin_lock_irqsave(&base->lock, flags); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); - ret = hrtimer_start(&alarm->timer, alarm->node.expires, - HRTIMER_MODE_ABS); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - return ret; } EXPORT_SYMBOL_GPL(alarm_start); @@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start); * @alarm: ptr to alarm to set * @start: time relative to now to run the alarm */ -int alarm_start_relative(struct alarm *alarm, ktime_t start) +void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; start = ktime_add(start, base->gettime()); - return alarm_start(alarm, start); + alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, */ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) { - clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; - if (!alarmtimer_get_rtcdev()) return -EINVAL; - return hrtimer_get_res(baseid, tp); + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; } /** diff --git a/kernel/kernel/time/clockevents.c b/kernel/kernel/time/clockevents.c index 637a09461..a9b76a403 100644 --- a/kernel/kernel/time/clockevents.c +++ b/kernel/kernel/time/clockevents.c @@ -94,23 +94,9 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); -static int __clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +static int __clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { - /* Transition with legacy set_mode() callback */ - if (dev->set_mode) { - /* Legacy callback doesn't support new modes */ - if (state > CLOCK_EVT_STATE_ONESHOT) - return -ENOSYS; - /* - * 'clock_event_state' and 'clock_event_mode' have 1-to-1 - * mapping until *_ONESHOT, and so a simple cast will work. - */ - dev->set_mode((enum clock_event_mode)state, dev); - dev->mode = (enum clock_event_mode)state; - return 0; - } - if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; @@ -120,19 +106,37 @@ static int __clockevents_set_state(struct clock_event_device *dev, /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: - return dev->set_state_shutdown(dev); + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; - return dev->set_state_periodic(dev); + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; - return dev->set_state_oneshot(dev); + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; + + case CLOCK_EVT_STATE_ONESHOT_STOPPED: + /* Core internal bug */ + if (WARN_ONCE(!clockevent_state_oneshot(dev), + "Current state: %d\n", + clockevent_get_state(dev))) + return -EINVAL; + + if (dev->set_state_oneshot_stopped) + return dev->set_state_oneshot_stopped(dev); + else + return -ENOSYS; default: return -ENOSYS; @@ -140,26 +144,26 @@ static int __clockevents_set_state(struct clock_event_device *dev, } /** - * clockevents_set_state - set the operating state of a clock event device + * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ -void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { - if (dev->state != state) { - if (__clockevents_set_state(dev, state)) + if (clockevent_get_state(dev) != state) { + if (__clockevents_switch_state(dev, state)) return; - dev->state = state; + clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (state == CLOCK_EVT_STATE_ONESHOT) { + if (clockevent_state_oneshot(dev)) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -174,7 +178,7 @@ void clockevents_set_state(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } @@ -186,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; - if (dev->set_mode) { - dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); - dev->mode = CLOCK_EVT_MODE_RESUME; - } else if (dev->tick_resume) { + if (dev->tick_resume) ret = dev->tick_resume(dev); - } return ret; } @@ -248,7 +248,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -285,7 +285,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -317,9 +317,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, dev->next_event = expires; - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; + /* We must be in ONESHOT state here */ + WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", + clockevent_get_state(dev)); + /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); @@ -362,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) + if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) @@ -388,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ - if (ced->state == CLOCK_EVT_STATE_DETACHED) { + if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } @@ -438,37 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind_device); -/* Sanity check of state transition callbacks */ -static int clockevents_sanity_check(struct clock_event_device *dev) -{ - /* Legacy set_mode() callback */ - if (dev->set_mode) { - /* We shouldn't be supporting new modes now */ - WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume); - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - return 0; - } - - if (dev->features & CLOCK_EVT_FEAT_DUMMY) - return 0; - - /* New state-specific callbacks */ - if (!dev->set_state_shutdown) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !dev->set_state_periodic) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && - !dev->set_state_oneshot) - return -EINVAL; - - return 0; -} - /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -477,10 +450,8 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(clockevents_sanity_check(dev)); - /* Initialize state to DETACHED */ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); @@ -545,11 +516,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->state == CLOCK_EVT_STATE_ONESHOT) + if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + if (clockevent_state_periodic(dev)) + return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -603,13 +574,13 @@ void clockevents_exchange_device(struct clock_event_device *old, */ if (old) { module_put(old->owner); - clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); + clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } if (new) { - BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } @@ -622,7 +593,7 @@ void clockevents_suspend(void) struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) - if (dev->suspend) + if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } @@ -634,7 +605,7 @@ void clockevents_resume(void) struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) - if (dev->resume) + if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } @@ -665,7 +636,7 @@ void tick_cleanup_dead_cpu(int cpu) if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { - BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } diff --git a/kernel/kernel/time/clocksource.c b/kernel/kernel/time/clocksource.c index 15facb1b9..1347882d1 100644 --- a/kernel/kernel/time/clocksource.c +++ b/kernel/kernel/time/clocksource.c @@ -23,6 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/device.h> #include <linux/clocksource.h> #include <linux/init.h> @@ -215,11 +217,12 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); - pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", + cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, csnow, cslast, cs->mask); __clocksource_unstable(cs); continue; @@ -476,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * return half the number of nanoseconds the hardware counter can technically * cover. This is done so that we can potentially detect problems caused by * delayed timers or bad hardware, which might result in time intervals that - * are larger then what the math used can handle without overflows. + * are larger than what the math used can handle without overflows. */ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { @@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - printk(KERN_WARNING "Override clocksource %s is not " - "HRT compatible. Cannot switch while in " - "HRT/NOHZ mode\n", cs->name); + pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); override_name[0] = 0; } else /* Override clocksource can be used. */ @@ -593,16 +595,15 @@ static void __clocksource_select(bool skipcur) */ static void clocksource_select(void) { - return __clocksource_select(false); + __clocksource_select(false); } static void clocksource_select_fallback(void) { - return __clocksource_select(true); + __clocksource_select(true); } #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ - static inline void clocksource_select(void) { } static inline void clocksource_select_fallback(void) { } @@ -708,8 +709,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocksource_update_max_deferment(cs); - pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", - cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); @@ -1008,12 +1009,10 @@ __setup("clocksource=", boot_override_clocksource); static int __init boot_override_clock(char* str) { if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. " - "Use clocksource=acpi_pm.\n"); + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); return boot_override_clocksource("acpi_pm"); } - printk("Warning! clock= boot option is deprecated. " - "Use clocksource=xyz\n"); + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); return boot_override_clocksource(str); } diff --git a/kernel/kernel/time/hrtimer.c b/kernel/kernel/time/hrtimer.c index 5d193396e..2659c632a 100644 --- a/kernel/kernel/time/hrtimer.c +++ b/kernel/kernel/time/hrtimer.c @@ -61,40 +61,36 @@ /* * The timer bases: * - * There are more clockids then hrtimer bases. Thus, we index + * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, }, } }; @@ -111,27 +107,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return hrtimer_clock_to_base_table[clock_id]; } - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. - */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot, tai; - ktime_t off_real, off_boot, off_tai; - - mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); - boot = ktime_add(mono, off_boot); - xtim = ktime_add(mono, off_real); - tai = ktime_add(mono, off_tai); - - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -139,6 +114,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) #ifdef CONFIG_SMP /* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .seq = SEQCNT_ZERO(migration_cpu_base), + .clock_base = { { .cpu_base = &migration_cpu_base, }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + +/* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. @@ -147,8 +134,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, @@ -158,7 +145,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; - if (likely(base != NULL)) { + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; @@ -192,21 +179,47 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return base; + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return base; +} +#endif + /* - * Switch the timer base to the current CPU when possible. + * We switch the timer base to a power-optimized selected CPU target, + * if: + * - NO_HZ_COMMON is enabled + * - timer migration is enabled + * - the timer callback is not running + * - the timer is not the first expiring timer on the new target + * + * If one of the above requirements is not fulfilled we move the timer + * to the current CPU or leave it on the previously assigned CPU if + * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index; + this_cpu_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_cpu_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -222,22 +235,24 @@ again: if (unlikely(hrtimer_callback_running(timer))) return base; - /* See the comment in lock_timer_base() */ - timer->base = NULL; + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_cpu_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_cpu_base; goto again; } } @@ -445,24 +460,35 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *timer) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + cpu_base->next_timer = timer; +#endif +} + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; - int i; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + hrtimer_update_next_timer(cpu_base, NULL); + for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; - next = timerqueue_getnext(&base->active); - if (!next) + if (!(active & 0x01)) continue; + next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) + if (expires.tv64 < expires_next.tv64) { expires_next = expires; + hrtimer_update_next_timer(cpu_base, timer); + } } /* * clock_was_set() might have changed base->offset of any of @@ -475,6 +501,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) } #endif +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -482,6 +518,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * High resolution timer enabled ? */ static int hrtimer_hres_enabled __read_mostly = 1; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode @@ -510,9 +548,14 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return cpu_base->hres_active; +} + static inline int hrtimer_hres_active(void) { - return __this_cpu_read(hrtimer_bases.hres_active); + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* @@ -523,7 +566,12 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + ktime_t expires_next; + + if (!cpu_base->hres_active) + return; + + expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -547,41 +595,40 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (cpu_base->hang_detected) return; - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(cpu_base->expires_next, 1); } /* - * Shared reprogramming for clock_realtime and clock_monotonic - * * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * - * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming - * and no expiry check happens. The timer gets enqueued into the rbtree. The - * reprogramming and expiry check is done in the hrtimer_interrupt or in the - * softirq. - * * Called with interrupts disabled and base->cpu_base.lock held */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled at the end of the hrtimer_interrupt. + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. */ - if (hrtimer_callback_running(timer)) - return 0; + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; if (base->cpu_base != cpu_base) return 0; @@ -591,24 +638,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, /* * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. + * expiry time which is less than base->offset. Set it to 0. */ if (expires.tv64 < 0) - return -ETIME; + expires.tv64 = 0; if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; + return; - /* - * When the target cpu of the timer is currently executing - * hrtimer_interrupt(), then we do not touch the clock event - * device. hrtimer_interrupt() will reevaluate all clock bases - * before reprogramming the device. - */ - if (cpu_base->in_hrtirq) - return 0; + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; /* * If a hang was detected in the last timer interrupt then we @@ -617,19 +656,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, * to make progress. */ if (cpu_base->hang_detected) - return 0; + return; cpu_base->expires_next = expires; /* - * Clockevents returns -ETIME, when the event was in the past. + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. */ - res = tick_program_event(expires, 1); - return res; + tick_program_event(expires, 1); } -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now); -static int hrtimer_rt_defer(struct hrtimer *timer); - /* * Initialize the high resolution related parts of cpu_base */ @@ -639,30 +675,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) -{ - if (!hrtimer_reprogram(timer, base)) - return 0; - if (!wakeup) - return -ETIME; -#ifdef CONFIG_PREEMPT_RT_BASE - if (!hrtimer_rt_defer(timer)) - return -ETIME; -#endif - return 1; -} - -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); -} - /* * Retrigger next event is called after clock was set * @@ -672,7 +684,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!hrtimer_hres_active()) + if (!base->hres_active) return; raw_spin_lock(&base->lock); @@ -684,32 +696,21 @@ static void retrigger_next_event(void *arg) /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static void hrtimer_switch_to_hres(void) { - int i, cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); - return 0; + "mode on CPU %d\n", base->cpu); + return; } base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; + hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - local_irq_restore(flags); - return 1; } static void clock_was_set_work(struct work_struct *work) @@ -769,25 +770,17 @@ void clock_was_set_delayed(void) #else +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) -{ - return 0; -} - -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} +static inline void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } + #endif /* CONFIG_HIGH_RES_TIMERS */ /* @@ -872,6 +865,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { @@ -883,8 +884,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + + if (interval.tv64 < hrtimer_resolution) + interval.tv64 = hrtimer_resolution; if (unlikely(delta.tv64 >= interval.tv64)) { s64 incr = ktime_to_ns(interval); @@ -924,7 +928,7 @@ void hrtimer_wait_for_timer(const struct hrtimer *timer) if (base && base->cpu_base && !timer->irqsafe) wait_event(base->cpu_base->wait, - !(timer->state & HRTIMER_STATE_CALLBACK)); + !(hrtimer_callback_running(timer))); } #else @@ -944,16 +948,11 @@ static int enqueue_hrtimer(struct hrtimer *timer, { debug_activate(timer); - timerqueue_add(&base->active, &timer->node); base->cpu_base->active_bases |= 1 << base->index; - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; + timer->state = HRTIMER_STATE_ENQUEUED; - return (&timer->node == base->active.next); + return timerqueue_add(&base->active, &timer->node); } /* @@ -968,46 +967,45 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { - struct timerqueue_node *next_timer; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + u8 state = timer->state; + + timer->state = newstate; + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; if (unlikely(!list_empty(&timer->cb_entry))) { list_del_init(&timer->cb_entry); - goto out; + return; } - next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (&timer->node == next_timer) { + if (!timerqueue_del(&base->active, &timer->node)) + cpu_base->active_bases &= ~(1 << base->index); + #ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); - } + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); #endif - } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); -out: - timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state; + u8 state = timer->state; int reprogram; /* @@ -1021,44 +1019,56 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state = timer->state & HRTIMER_STATE_CALLBACK; + + if (!restart) + state = HRTIMER_STATE_INACTIVE; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, leftmost; + int leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); + remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1077,94 +1087,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } #endif leftmost = enqueue_hrtimer(timer, new_base); - - if (!leftmost) { - unlock_hrtimer_base(timer, &flags); - return ret; - } + if (!leftmost) + goto unlock; if (!hrtimer_is_hres_active(timer)) { /* * Kick to reschedule the next tick to handle the new timer * on dynticks target. */ - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases)) { - - ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup); - if (ret < 0) { - /* - * In case we failed to reprogram the timer (mostly - * because out current timer is already elapsed), - * remove it again and report a failure. This avoids - * stale base->first entries. - */ - debug_deactivate(timer); - __remove_hrtimer(timer, new_base, - timer->state & HRTIMER_STATE_CALLBACK, 0); - } else if (ret > 0) { - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. - */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return 0; - } + if (new_base->cpu_base->nohz_active) + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else { + hrtimer_reprogram(timer, new_base); } - +unlock: unlock_hrtimer_base(timer, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - -/** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * @@ -1180,10 +1121,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) unsigned long flags; int ret = -1; + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. + */ + if (!hrtimer_active(timer)) + return 0; + base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags); @@ -1215,44 +1165,44 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. + * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -ktime_t hrtimer_get_next_event(void) +u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t mindelta = { .tv64 = KTIME_MAX }; + u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), - ktime_get()); + if (!__hrtimer_hres_active(cpu_base)) + expires = __hrtimer_get_next_event(cpu_base).tv64; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; + return expires; } #endif @@ -1295,41 +1245,86 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. + * It is important for this function to not return a false negative. */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); + unsigned int seq; - cpu_base = raw_cpu_ptr(&hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); + do { + cpu_base = READ_ONCE(timer->base->cpu_base); + seq = raw_read_seqcount_begin(&cpu_base->seq); - return 0; + if (timer->state != HRTIMER_STATE_INACTIVE || + cpu_base->running_soft == timer || + cpu_base->running == timer) + return true; + + } while (read_seqcount_retry(&cpu_base->seq, seq) || + cpu_base != READ_ONCE(timer->base->cpu_base)); + + return false; } -EXPORT_SYMBOL_GPL(hrtimer_get_res); +EXPORT_SYMBOL_GPL(hrtimer_active); + +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consequtive + * __run_hrtimer() invocations. + */ -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now) { - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; enum hrtimer_restart (*fn)(struct hrtimer *); int restart; - WARN_ON(!irqs_disabled()); + lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + cpu_base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock * the timer base. @@ -1341,69 +1336,57 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) raw_spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * Note: We clear the running state after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already. */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - } - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); - timer->state &= ~HRTIMER_STATE_CALLBACK; + WARN_ON_ONCE(cpu_base->running != timer); + cpu_base->running = NULL; } -static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); - #ifdef CONFIG_PREEMPT_RT_BASE static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, struct hrtimer_clock_base *base) { - /* - * Note, we clear the callback flag before we requeue the - * timer otherwise we trigger the callback_running() check - * in hrtimer_reprogram(). - */ - timer->state &= ~HRTIMER_STATE_CALLBACK; - - if (restart != HRTIMER_NORESTART) { - BUG_ON(hrtimer_active(timer)); - /* - * Enqueue the timer, if it's the leftmost timer then - * we need to reprogram it. - */ - if (!enqueue_hrtimer(timer, base)) - return; + int leftmost; -#ifndef CONFIG_HIGH_RES_TIMERS - } -#else - if (base->cpu_base->hres_active && - hrtimer_reprogram(timer, base)) - goto requeue; + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) { - } else if (hrtimer_active(timer)) { - /* - * If the timer was rearmed on another CPU, reprogram - * the event device. - */ - if (&timer->node == base->active.next && - base->cpu_base->hres_active && - hrtimer_reprogram(timer, base)) - goto requeue; - } - return; + leftmost = enqueue_hrtimer(timer, base); + if (!leftmost) + return; +#ifdef CONFIG_HIGH_RES_TIMERS + if (!hrtimer_is_hres_active(timer)) { + /* + * Kick to reschedule the next tick to handle the new timer + * on dynticks target. + */ + if (base->cpu_base->nohz_active) + wake_up_nohz_cpu(base->cpu_base->cpu); + } else { -requeue: - /* - * Timer is expired. Thus move it from tree to pending list - * again. - */ - __remove_hrtimer(timer, base, timer->state, 0); - list_add_tail(&timer->cb_entry, &base->expired); + hrtimer_reprogram(timer, base); + } #endif + } } /* @@ -1436,8 +1419,11 @@ static void hrtimer_rt_run_pending(void) * Same as the above __run_hrtimer function * just we run with interrupts enabled. */ - debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + debug_deactivate(timer); + cpu_base->running_soft = timer; + raw_write_seqcount_barrier(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1446,6 +1432,10 @@ static void hrtimer_rt_run_pending(void) raw_spin_lock_irq(&cpu_base->lock); hrtimer_rt_reprogram(restart, timer, base); + raw_write_seqcount_barrier(&cpu_base->seq); + + WARN_ON_ONCE(cpu_base->running_soft != timer); + cpu_base->running_soft = NULL; } } @@ -1466,53 +1456,25 @@ static int hrtimer_rt_defer(struct hrtimer *timer) #else -static inline void hrtimer_rt_run_pending(void) -{ - hrtimer_peek_ahead_timers(); -} - static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } #endif -#ifdef CONFIG_HIGH_RES_TIMERS +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0, raise = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - cpu_base->in_hrtirq = 1; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. - */ - cpu_base->expires_next.tv64 = KTIME_MAX; + struct hrtimer_clock_base *base = cpu_base->clock_base; + unsigned int active = cpu_base->active_bases; + int raise = 0; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; + for (; active; base++, active >>= 1) { struct timerqueue_node *node; ktime_t basenow; - if (!(cpu_base->active_bases & (1 << i))) + if (!(active & 0x01)) continue; - base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1545,11 +1507,46 @@ retry: break; if (!hrtimer_rt_defer(timer)) - __run_hrtimer(timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow); else raise = 1; } } + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + __hrtimer_run_queues(cpu_base, now); + /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); /* @@ -1561,10 +1558,9 @@ retry: raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { + if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; - goto out; + return; } /* @@ -1595,8 +1591,8 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; + if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta.tv64; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. @@ -1608,16 +1604,13 @@ retry: tick_program_event(expires_next, 1); printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); -out: - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* * local version of hrtimer_peek_ahead_timers() called with interrupts * disabled. */ -static void __hrtimer_peek_ahead_timers(void) +static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1629,102 +1622,39 @@ static void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_rt_run_pending(); -} - /* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. + * Called from run_local_timers in hardirq context every jiffy */ -void hrtimer_run_pending(void) +void hrtimer_run_queues(void) { - if (hrtimer_hres_active()) + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now; + + if (__hrtimer_hres_active(cpu_base)) return; /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy - */ -void hrtimer_run_queues(void) -{ - struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1, raise = 0; - - if (hrtimer_hres_active()) return; - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - if (!hrtimer_rt_defer(timer)) - __run_hrtimer(timer, &base->softirq_time); - else - raise = 1; - } - raw_spin_unlock(&cpu_base->lock); } - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now); + raw_spin_unlock(&cpu_base->lock); } /* @@ -1759,8 +1689,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(state); hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; if (likely(t->task)) freezable_schedule(); @@ -1937,11 +1865,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, debug_deactivate(timer); /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1952,9 +1880,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * event device. */ enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; } } @@ -2021,12 +1946,21 @@ static struct notifier_block hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; +#ifdef CONFIG_PREEMPT_RT_BASE +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_rt_run_pending(); +} +#endif + void __init hrtimers_init(void) { hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_PREEMPT_RT_BASE open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif } /** @@ -2065,8 +1999,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, hrtimer_init_sleeper(&t, current); hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); diff --git a/kernel/kernel/time/itimer.c b/kernel/kernel/time/itimer.c index d0513909d..184de6751 100644 --- a/kernel/kernel/time/itimer.c +++ b/kernel/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above diff --git a/kernel/kernel/time/ntp.c b/kernel/kernel/time/ntp.c index bd9c53985..cd925efb2 100644 --- a/kernel/kernel/time/ntp.c +++ b/kernel/kernel/time/ntp.c @@ -36,6 +36,7 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; +#define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -77,6 +78,9 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ +static time64_t ntp_next_leap_sec = TIME64_MAX; + #ifdef CONFIG_NTP_PPS /* @@ -96,7 +100,7 @@ static s64 ntp_tick_adj; static int pps_valid; /* signal watchdog counter */ static long pps_tf[3]; /* phase median filter */ static long pps_jitter; /* current jitter (ns) */ -static struct timespec pps_fbase; /* beginning of the last freq interval */ +static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ @@ -350,6 +354,7 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } @@ -360,6 +365,21 @@ u64 ntp_tick_length(void) return tick_length; } +/** + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * + * Provides the time of the next leapsecond against CLOCK_REALTIME in + * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + */ +ktime_t ntp_get_next_leap(void) +{ + ktime_t ret; + + if ((time_state == TIME_INS) && (time_status & STA_INS)) + return ktime_set(ntp_next_leap_sec, 0); + ret.tv64 = KTIME_MAX; + return ret; +} /* * this routine handles the overflow of the microsecond field @@ -383,15 +403,21 @@ int second_overflow(unsigned long secs) */ switch (time_state) { case TIME_OK: - if (time_status & STA_INS) + if (time_status & STA_INS) { time_state = TIME_INS; - else if (time_status & STA_DEL) + ntp_next_leap_sec = secs + SECS_PER_DAY - + (secs % SECS_PER_DAY); + } else if (time_status & STA_DEL) { time_state = TIME_DEL; + ntp_next_leap_sec = secs + SECS_PER_DAY - + ((secs+1) % SECS_PER_DAY); + } break; case TIME_INS: - if (!(time_status & STA_INS)) + if (!(time_status & STA_INS)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if (secs % 86400 == 0) { + } else if (secs % SECS_PER_DAY == 0) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -399,19 +425,21 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if (!(time_status & STA_DEL)) + if (!(time_status & STA_DEL)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if ((secs + 1) % 86400 == 0) { + } else if ((secs + 1) % SECS_PER_DAY == 0) { leap = 1; + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; break; - case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; @@ -460,6 +488,11 @@ out: } #ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock(struct timespec now) +{ + return -ENODEV; +} + int __weak update_persistent_clock64(struct timespec64 now64) { struct timespec now; @@ -477,7 +510,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { struct timespec64 now; - struct timespec next; + struct timespec64 next; int fail = 1; /* @@ -527,7 +560,7 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_nsec -= NSEC_PER_SEC; } queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec_to_jiffies(&next)); + &sync_cmos_work, timespec64_to_jiffies(&next)); } #ifdef CONFIG_PREEMPT_RT_FULL @@ -590,6 +623,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + ntp_next_leap_sec = TIME64_MAX; /* restart PPS frequency calibration */ pps_reset_freq_interval(); } @@ -754,6 +788,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; + /* Handle leapsec adjustments */ + if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if ((time_state == TIME_INS) && (time_status & STA_INS)) { + result = TIME_OOP; + txc->tai++; + txc->time.tv_sec--; + } + if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + result = TIME_WAIT; + txc->tai--; + txc->time.tv_sec++; + } + if ((time_state == TIME_OOP) && + (ts->tv_sec == ntp_next_leap_sec)) { + result = TIME_WAIT; + } + } + return result; } @@ -764,13 +816,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ struct pps_normtime { - __kernel_time_t sec; /* seconds */ + s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; /* normalize the timestamp so that nsec is in the ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ -static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { .sec = ts.tv_sec, @@ -852,7 +904,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) pps_errcnt++; pps_dec_freq_interval(); printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %ld s\n", + "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; } @@ -939,7 +991,7 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ -void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; @@ -960,7 +1012,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) } /* ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); /* check that the signal is in the range * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ diff --git a/kernel/kernel/time/ntp_internal.h b/kernel/kernel/time/ntp_internal.h index bbd102ad9..af924470e 100644 --- a/kernel/kernel/time/ntp_internal.h +++ b/kernel/kernel/time/ntp_internal.h @@ -5,8 +5,9 @@ extern void ntp_init(void); extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); +extern ktime_t ntp_get_next_leap(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); -extern void __hardpps(const struct timespec *, const struct timespec *); +extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/kernel/time/posix-clock.c b/kernel/kernel/time/posix-clock.c index ce033c7aa..9cff0ab82 100644 --- a/kernel/kernel/time/posix-clock.c +++ b/kernel/kernel/time/posix-clock.c @@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - int result = 0; + unsigned int result = 0; if (!clk) - return -ENODEV; + return POLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); diff --git a/kernel/kernel/time/posix-cpu-timers.c b/kernel/kernel/time/posix-cpu-timers.c index c8c7150f2..bf28ef601 100644 --- a/kernel/kernel/time/posix-cpu-timers.c +++ b/kernel/kernel/time/posix-cpu-timers.c @@ -197,39 +197,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +/* + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg + * to avoid race conditions with concurrent updates to cputime. + */ +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) { - if (b->utime > a->utime) - a->utime = b->utime; + u64 curr_cputime; +retry: + curr_cputime = atomic64_read(cputime); + if (sum_cputime > curr_cputime) { + if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) + goto retry; + } +} - if (b->stime > a->stime) - a->stime = b->stime; +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +{ + __update_gt_cputime(&cputime_atomic->utime, sum->utime); + __update_gt_cputime(&cputime_atomic->stime, sum->stime); + __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); +} - if (b->sum_exec_runtime > a->sum_exec_runtime) - a->sum_exec_runtime = b->sum_exec_runtime; +/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ +static inline void sample_cputime_atomic(struct task_cputime *times, + struct task_cputime_atomic *atomic_times) +{ + times->utime = atomic64_read(&atomic_times->utime); + times->stime = atomic64_read(&atomic_times->stime); + times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct task_cputime sum; - unsigned long flags; - if (!cputimer->running) { + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) { /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have - * to synchronize the timer to the clock every time we start - * it. + * to synchronize the timer to the clock every time we start it. */ thread_group_cputime(tsk, &sum); - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - update_gt_cputime(&cputimer->cputime, &sum); - } else - raw_spin_lock_irqsave(&cputimer->lock, flags); - *times = cputimer->cputime; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); + update_gt_cputime(&cputimer->cputime_atomic, &sum); + + /* + * We're setting cputimer->running without a lock. Ensure + * this only gets written to in one operation. We set + * running after update_gt_cputime() as a small optimization, + * but barriers are not required because update_gt_cputime() + * can handle concurrent updates. + */ + WRITE_ONCE(cputimer->running, true); + } + sample_cputime_atomic(times, &cputimer->cputime_atomic); } /* @@ -583,7 +606,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) if (!task_cputime_zero(&tsk->cputime_expires)) return false; - if (tsk->signal->cputimer.running) + /* Check if cputimer is running. This is accessed without locking. */ + if (READ_ONCE(tsk->signal->cputimer.running)) return false; return true; @@ -841,6 +865,13 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long long expires; unsigned long soft; + /* + * If cputime_expires is zero, then there are no active + * per thread CPU timers. + */ + if (task_cputime_zero(&tsk->cputime_expires)) + return; + expires = check_timers_list(timers, firing, prof_ticks(tsk)); tsk_expires->prof_exp = expires_to_cputime(expires); @@ -853,10 +884,10 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -883,14 +914,12 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct signal_struct *sig) +static inline void stop_process_timers(struct signal_struct *sig) { struct thread_group_cputimer *cputimer = &sig->cputimer; - unsigned long flags; - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 0; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); + /* Turn off cputimer->running. This is done without locking. */ + WRITE_ONCE(cputimer->running, false); } static u32 onecputick; @@ -941,6 +970,19 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* + * If cputimer is not running, then there are no active + * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + */ + if (!READ_ONCE(tsk->signal->cputimer.running)) + return; + + /* + * Signify that a thread is checking for process timers. + * Write access to this field is protected by the sighand lock. + */ + sig->cputimer.checking_timer = true; + + /* * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); @@ -959,11 +1001,11 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; if (psecs >= hard) { /* @@ -994,6 +1036,8 @@ static void check_process_timers(struct task_struct *tsk, sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); + + sig->cputimer.checking_timer = false; } /* @@ -1096,29 +1140,36 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample = { - .utime = utime, - .stime = stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; + struct task_cputime task_sample; + task_cputime(tsk, &task_sample.utime, &task_sample.stime); + task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } sig = tsk->signal; - if (sig->cputimer.running) { + /* + * Check if thread group timers expired when the cputimer is + * running and no other thread in the group is already checking + * for thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to + * be a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to check/handle timers. + * + * In the worst case scenario, if 'running' or 'checking_timer' gets + * set but the current thread doesn't see the change yet, we'll wait + * until the next thread in the group gets a scheduler interrupt to + * handle the timer. This isn't an issue in practice because these + * types of delays with signals actually getting sent are expected. + */ + if (READ_ONCE(sig->cputimer.running) && + !READ_ONCE(sig->cputimer.checking_timer)) { struct task_cputime group_sample; - unsigned long flags; - raw_spin_lock_irqsave(&sig->cputimer.lock, flags); - group_sample = sig->cputimer.cputime; - raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags); + sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1155,12 +1206,8 @@ static void __run_posix_cpu_timers(struct task_struct *tsk) * put them on the firing list. */ check_thread_timers(tsk, &firing); - /* - * If there are any active process wide timers (POSIX 1.b, itimers, - * RLIMIT_CPU) cputimer must be running. - */ - if (tsk->signal->cputimer.running) - check_process_timers(tsk, &firing); + + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. diff --git a/kernel/kernel/time/posix-timers.c b/kernel/kernel/time/posix-timers.c index 0f5d7eae6..464a98155 100644 --- a/kernel/kernel/time/posix-timers.c +++ b/kernel/kernel/time/posix-timers.c @@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, @@ -290,7 +297,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -300,7 +307,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { @@ -312,7 +319,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; struct k_clock clock_tai = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -322,7 +329,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -755,7 +762,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* diff --git a/kernel/kernel/time/tick-broadcast-hrtimer.c b/kernel/kernel/time/tick-broadcast-hrtimer.c index 6aac4beed..1b4ac3361 100644 --- a/kernel/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/kernel/time/tick-broadcast-hrtimer.c @@ -18,29 +18,23 @@ static struct hrtimer bctimer; -static void bc_set_mode(enum clock_event_mode mode, - struct clock_event_device *bc) +static int bc_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_SHUTDOWN: - /* - * Note, we cannot cancel the timer here as we might - * run into the following live lock scenario: - * - * cpu 0 cpu1 - * lock(broadcast_lock); - * hrtimer_interrupt() - * bc_handler() - * tick_handle_oneshot_broadcast(); - * lock(broadcast_lock); - * hrtimer_cancel() - * wait_for_callback() - */ - hrtimer_try_to_cancel(&bctimer); - break; - default: - break; - } + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + return 0; } /* @@ -66,9 +60,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer_{start/cancel} functions call into tracing, * calls to these functions must be bound within RCU_NONIDLE. */ - RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? - !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : - 0); + RCU_NONIDLE({ + bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; + if (bc_moved) + hrtimer_start(&bctimer, expires, + HRTIMER_MODE_ABS_PINNED);}); if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); @@ -79,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { - .set_mode = bc_set_mode, + .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME | @@ -99,15 +95,17 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) - return HRTIMER_NORESTART; + if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) + if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) + return HRTIMER_RESTART; - return HRTIMER_RESTART; + return HRTIMER_NORESTART; } void tick_setup_hrtimer_broadcast(void) { hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); bctimer.function = bc_handler; + bctimer.irqsafe = true; clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/kernel/time/tick-broadcast.c b/kernel/kernel/time/tick-broadcast.c index 7e8ca4f44..f6aae7977 100644 --- a/kernel/kernel/time/tick-broadcast.c +++ b/kernel/kernel/time/tick-broadcast.c @@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret; + int ret = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt - * is delivered by the broadcast device. + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. */ - ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: - /* Nothing to do */ - ret = 0; break; } } @@ -255,18 +256,32 @@ int tick_receive_broadcast(void) /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ -static void tick_do_broadcast(struct cpumask *mask) +static bool tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; + bool local = false; /* * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->event_handler(td->evtdev); + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. + * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { @@ -279,16 +294,17 @@ static void tick_do_broadcast(struct cpumask *mask) td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } + return local; } /* * Periodic broadcast: * - invoke the broadcast handlers */ -static void tick_do_periodic_broadcast(void) +static bool tick_do_periodic_broadcast(void) { cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); - tick_do_broadcast(tmpmask); + return tick_do_broadcast(tmpmask); } /* @@ -296,34 +312,33 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - ktime_t next; + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool bc_local; raw_spin_lock(&tick_broadcast_lock); - tick_do_periodic_broadcast(); + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } - /* - * The device is in periodic mode. No reprogramming necessary: - */ - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - goto unlock; + bc_local = tick_do_periodic_broadcast(); - /* - * Setup the next period for devices, which do not have - * periodic mode. We read dev->next_event first and add to it - * when the event already expired. clockevents_program_event() - * sets dev->next_event only when the event is really - * programmed to the device. - */ - for (next = dev->next_event; ;) { - next = ktime_add(next, tick_period); + if (clockevent_state_oneshot(dev)) { + ktime_t next = ktime_add(dev->next_event, tick_period); - if (!clockevents_program_event(dev, next, false)) - goto unlock; - tick_do_periodic_broadcast(); + clockevents_program_event(dev, next, true); } -unlock: raw_spin_unlock(&tick_broadcast_lock); + + /* + * We run the handler of the local cpu after dropping + * tick_broadcast_lock because the handler might deadlock when + * trying to switch to oneshot mode. + */ + if (bc_local) + td->evtdev->event_handler(td->evtdev); } /** @@ -366,8 +381,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hickup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; @@ -386,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) break; } - if (cpumask_empty(tick_broadcast_mask)) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } } raw_spin_unlock(&tick_broadcast_lock); } @@ -532,23 +557,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc, irq_set_affinity(bc->irq, bc->cpumask); } -static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, - ktime_t expires, int force) +static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires) { - int ret; - - if (bc->state != CLOCK_EVT_STATE_ONESHOT) - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + if (!clockevent_state_oneshot(bc)) + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - ret = clockevents_program_event(bc, expires, force); - if (!ret) - tick_broadcast_set_affinity(bc, cpumask_of(cpu)); - return ret; + clockevents_program_event(bc, expires, 1); + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); } static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* @@ -566,7 +587,7 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_state(td->evtdev, + clockevents_switch_state(td->evtdev, CLOCK_EVT_STATE_ONESHOT); } } @@ -580,9 +601,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) struct tick_device *td; ktime_t now, next_event; int cpu, next_cpu = 0; + bool bc_local; raw_spin_lock(&tick_broadcast_lock); -again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; cpumask_clear(tmpmask); @@ -624,7 +645,7 @@ again: /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(tmpmask); + bc_local = tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -636,15 +657,15 @@ again: * - There are pending events on sleeping CPUs which were not * in the event mask */ - if (next_event.tv64 != KTIME_MAX) { - /* - * Rearm the broadcast device. If event expired, - * repeat the above - */ - if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) - goto again; - } + if (next_event.tv64 != KTIME_MAX) + tick_broadcast_set_event(dev, next_cpu, next_event); + raw_spin_unlock(&tick_broadcast_lock); + + if (bc_local) { + td = this_cpu_ptr(&tick_cpu_device); + td->evtdev->event_handler(td->evtdev); + } } static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) @@ -670,77 +691,88 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -/** - * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode - * @state: The target state (enter/exit) - * - * The system enters/leaves a state, where affected devices might stop - * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. - * - * Called with interrupts disabled, so clockevents_lock is not - * required here because the local clock event device cannot go away - * under us. - */ -int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; - struct tick_device *td; int cpu, ret = 0; ktime_t now; /* - * Periodic mode does not care about the enter/exit of power - * states + * If there is no broadcast device, tell the caller not to go + * into deep idle. */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return 0; + if (!tick_broadcast_device.evtdev) + return -EBUSY; - /* - * We are called with preemtion disabled from the depth of the - * idle code, so we can't be moved away. - */ - td = this_cpu_ptr(&tick_cpu_device); - dev = td->evtdev; - - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + dev = this_cpu_ptr(&tick_cpu_device)->evtdev; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; + goto out; + } + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); + /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and * if the cpu local event is earlier than the * broadcast event. If the current CPU is in * the force mask, then we are going to be - * woken by the IPI right away. + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. */ - if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && - dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, cpu, dev->next_event, 1); + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event.tv64 < bc->next_event.tv64) { + tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } + } } - /* - * If the current CPU owns the hrtimer broadcast - * mechanism, it cannot go deep idle and we remove the - * CPU from the broadcast mask. We don't have to go - * through the EXIT path as the local timer is not - * shutdown. - */ - ret = broadcast_needs_cpu(bc, cpu); - if (ret) - cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -807,7 +839,6 @@ out: raw_spin_unlock(&tick_broadcast_lock); return ret; } -EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu @@ -842,7 +873,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; + int was_periodic = clockevent_state_periodic(bc); bc->event_handler = tick_handle_oneshot_broadcast; @@ -858,10 +889,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period); } else bc->next_event.tv64 = KTIME_MAX; } else { @@ -949,6 +980,16 @@ bool tick_broadcast_oneshot_available(void) return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} #endif void __init tick_broadcast_init(void) diff --git a/kernel/kernel/time/tick-common.c b/kernel/kernel/time/tick-common.c index 14a10917c..5a47f2e98 100644 --- a/kernel/kernel/time/tick-common.c +++ b/kernel/kernel/time/tick-common.c @@ -19,6 +19,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/module.h> +#include <trace/events/power.h> #include <asm/irq_regs.h> @@ -104,7 +105,17 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); - if (dev->state != CLOCK_EVT_STATE_ONESHOT) +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). + */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + + if (!clockevent_state_oneshot(dev)) return; for (;;) { /* @@ -142,7 +153,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -152,7 +163,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) @@ -295,9 +306,6 @@ void tick_check_new_device(struct clock_event_device *newdev) int cpu; cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, newdev->cpumask)) - goto out_bc; - td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -334,6 +342,28 @@ out_bc: tick_install_broadcast_device(newdev); } +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); + #ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. @@ -369,8 +399,7 @@ void tick_shutdown(unsigned int cpu) * Prevent that the clock events layer tries to call * the set mode function! */ - dev->state = CLOCK_EVT_STATE_DETACHED; - dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; @@ -442,6 +471,7 @@ void tick_resume(void) tick_resume_local(); } +#ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); static unsigned int tick_freeze_depth; @@ -459,10 +489,13 @@ void tick_freeze(void) raw_spin_lock(&tick_freeze_lock); tick_freeze_depth++; - if (tick_freeze_depth == num_online_cpus()) + if (tick_freeze_depth == num_online_cpus()) { + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), true); timekeeping_suspend(); - else + } else { tick_suspend_local(); + } raw_spin_unlock(&tick_freeze_lock); } @@ -480,15 +513,19 @@ void tick_unfreeze(void) { raw_spin_lock(&tick_freeze_lock); - if (tick_freeze_depth == num_online_cpus()) + if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); - else + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), false); + } else { tick_resume_local(); + } tick_freeze_depth--; raw_spin_unlock(&tick_freeze_lock); } +#endif /* CONFIG_SUSPEND */ /** * tick_init - initialize the tick control diff --git a/kernel/kernel/time/tick-internal.h b/kernel/kernel/time/tick-internal.h index b64fdd805..966a5a6fd 100644 --- a/kernel/kernel/time/tick-internal.h +++ b/kernel/kernel/time/tick-internal.h @@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) +{ + return dev->state_use_accessors; +} + +static inline void clockevent_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + dev->state_use_accessors = state; +} + extern void clockevents_shutdown(struct clock_event_device *dev); extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); -extern void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state); +extern void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); @@ -137,3 +148,19 @@ extern void tick_nohz_init(void); # else static inline void tick_nohz_init(void) { } #endif + +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +#else +#define tick_nohz_active (0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void timers_update_migration(bool update_nohz); +#else +static inline void timers_update_migration(bool update_nohz) { } +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/kernel/time/tick-oneshot.c b/kernel/kernel/time/tick-oneshot.c index 67a64b167..b51344652 100644 --- a/kernel/kernel/time/tick-oneshot.c +++ b/kernel/kernel/time/tick-oneshot.c @@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + if (unlikely(expires.tv64 == KTIME_MAX)) { + /* + * We don't need the clock event device any more, stop it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + return 0; + } + + if (unlikely(clockevent_state_oneshot_stopped(dev))) { + /* + * We need the clock event again, configure it in ONESHOT mode + * before using it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + } + return clockevents_program_event(dev, expires, force); } @@ -38,7 +54,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } diff --git a/kernel/kernel/time/tick-sched.c b/kernel/kernel/time/tick-sched.c index f61dbf202..d536824cb 100644 --- a/kernel/kernel/time/tick-sched.c +++ b/kernel/kernel/time/tick-sched.c @@ -207,27 +207,9 @@ static bool can_stop_full_tick(void) return true; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); - -/* - * Re-evaluate the need for the tick on the current CPU - * and restart it if necessary. - */ -void __tick_nohz_full_check(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - if (tick_nohz_full_cpu(smp_processor_id())) { - if (ts->tick_stopped && !is_idle_task(current)) { - if (!can_stop_full_tick()) - tick_nohz_restart_sched_tick(ts, ktime_get()); - } - } -} - static void nohz_full_kick_work_func(struct irq_work *work) { - __tick_nohz_full_check(); + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { @@ -263,7 +245,7 @@ void tick_nohz_full_kick_cpu(int cpu) static void nohz_full_kick_ipi(void *info) { - __tick_nohz_full_check(); + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } /* @@ -287,7 +269,7 @@ void tick_nohz_full_kick_all(void) * It might need the tick due to per task/process properties: * perf events, posix cpu timers, ... */ -void __tick_nohz_task_switch(struct task_struct *tsk) +void __tick_nohz_task_switch(void) { unsigned long flags; @@ -319,16 +301,17 @@ static int __init tick_nohz_full_setup(char *str) __setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* - * If we handle the timekeeping duty for full dynticks CPUs, - * we can't safely shutdown that CPU. + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; @@ -399,6 +382,12 @@ void __init tick_nohz_init(void) cpu_notifier(tick_nohz_cpu_down_callback, 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); + + /* + * We need at least one CPU to handle housekeeping work such + * as timekeeping, unbound timers, workqueues, ... + */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif @@ -410,7 +399,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? */ static int tick_nohz_enabled __read_mostly = 1; -int tick_nohz_active __read_mostly; +unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -578,179 +567,176 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, ret = { .tv64 = 0 }; - unsigned long rcu_delta_jiffies; struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - u64 time_delta; - - time_delta = timekeeping_max_deferment(); + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + unsigned long seq, basejiff; + ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqcount_begin(&jiffies_seq); - last_update = last_jiffies_update; - last_jiffies = jiffies; + basemono = last_jiffies_update.tv64; + basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); + ts->last_jiffies = basejiff; - if (rcu_needs_cpu(&rcu_delta_jiffies) || + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu()) { - next_jiffies = last_jiffies + 1; - delta_jiffies = 1; + next_tick = basemono + TICK_NSEC; } else { - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - if (rcu_delta_jiffies < delta_jiffies) { - next_jiffies = last_jiffies + rcu_delta_jiffies; - delta_jiffies = rcu_delta_jiffies; - } + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tmr = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tmr; + /* Take the next rcu event into account */ + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } /* - * Do not stop the tick, if we are only one off (or less) - * or if the cpu is required for RCU: + * If the tick is due in the next period, keep it ticking or + * restart it proper. */ - if (!ts->tick_stopped && delta_jiffies <= 1) - goto out; - - /* Schedule the tick, if we are at least one jiffie off */ - if ((long)delta_jiffies >= 1) { - - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. Keep track of the fact that it was the one - * which had the do_timer() duty last. If this cpu is - * the one which had the do_timer() duty last, we - * limit the sleep time to the timekeeping - * max_deferement value which we retrieved - * above. Otherwise we can sleep as long as we want. - */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->do_timer_last = 1; - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; - ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + tick.tv64 = 0; + if (!ts->tick_stopped) + goto out; + if (delta == 0) { + /* Tick is stopped, but required now. Enforce it */ + tick_nohz_restart(ts, now); + goto out; } + } + + /* + * If this cpu is the one which updates jiffies, then give up + * the assignment and let it be taken by the cpu which runs + * the tick timer next, which might be this cpu as well. If we + * don't drop this here the jiffies might be stale and + * do_timer() never invoked. Keep track of the fact that it + * was the one which had the do_timer() duty last. If this cpu + * is the one which had the do_timer() duty last, we limit the + * sleep time to the timekeeping max_deferement value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + delta = KTIME_MAX; + } #ifdef CONFIG_NO_HZ_FULL - if (!ts->inidle) { - time_delta = min(time_delta, - scheduler_tick_max_deferment()); - } + /* Limit the tick delta to the maximum scheduler deferment */ + if (!ts->inidle) + delta = min(delta, scheduler_tick_max_deferment()); #endif - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals - * that there is no timer pending or at least extremely - * far into the future (12 days for HZ=1000). In this - * case we set the expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. - */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); - else - expires.tv64 = KTIME_MAX; - - /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) - goto out; + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; - ret = expires; + expires = min_t(u64, expires, next_tick); + tick.tv64 = expires; - /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. - */ - if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); - - ts->last_tick = hrtimer_get_expires(&ts->sched_timer); - ts->tick_stopped = 1; - trace_tick_stop(1, " "); - } + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && (expires == dev->next_event.tv64)) + goto out; - /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. - */ - if (unlikely(expires.tv64 == KTIME_MAX)) { - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_cancel(&ts->sched_timer); - goto out; - } + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + nohz_balance_enter_idle(cpu); + calc_load_enter_idle(); - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - else - tick_program_event(expires, 1); - } else { - /* Tick is stopped, but required now. Enforce it */ - tick_nohz_restart(ts, now); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, " "); + } + /* + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. + */ + if (unlikely(expires == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; } + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(tick, 1); out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; + /* Update the estimated sleep length */ ts->sleep_length = ktime_sub(dev->next_event, now); + return tick; +} - return ret; +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); + + calc_load_exit_idle(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); } -static void tick_nohz_full_stop_tick(struct tick_sched *ts) +static void tick_nohz_full_update_tick(struct tick_sched *ts) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); - if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + if (!tick_nohz_full_cpu(cpu)) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (!can_stop_full_tick()) - return; - - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + if (can_stop_full_tick()) + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + else if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -873,7 +859,7 @@ void tick_nohz_irq_exit(void) if (ts->inidle) __tick_nohz_idle_enter(ts); else - tick_nohz_full_stop_tick(ts); + tick_nohz_full_update_tick(ts); } /** @@ -888,23 +874,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) -{ - /* Update jiffies first */ - tick_do_update_jiffies64(now); - update_cpu_load_nohz(); - - calc_load_exit_idle(); - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); -} - static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE @@ -958,12 +927,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) -{ - hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); -} - /* * The nohz low res interrupt handler */ @@ -982,10 +945,18 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - while (tick_nohz_reprogram(ts, now)) { - now = ktime_get(); - tick_do_update_jiffies64(now); - } + hrtimer_forward(&ts->sched_timer, now, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_migration(true); } /** @@ -999,13 +970,8 @@ static void tick_nohz_switch_to_nohz(void) if (!tick_nohz_enabled) return; - local_irq_disable(); - if (tick_switch_to_oneshot(tick_nohz_handler)) { - local_irq_enable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) return; - } - tick_nohz_active = 1; - ts->nohz_mode = NOHZ_MODE_LOWRES; /* * Recycle the hrtimer in ts, so we can share the @@ -1015,13 +981,10 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - for (;;) { - hrtimer_set_expires(&ts->sched_timer, next); - if (!tick_program_event(next, 0)) - break; - next = ktime_add(next, tick_period); - } - local_irq_enable(); + hrtimer_set_expires(&ts->sched_timer, next); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } /* @@ -1073,6 +1036,7 @@ static inline void tick_nohz_irq_enter(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1154,22 +1118,9 @@ void tick_setup_sched_timer(void) hrtimer_add_expires_ns(&ts->sched_timer, offset); } - for (;;) { - hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - now = ktime_get(); - } - -#ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) { - ts->nohz_mode = NOHZ_MODE_HIGHRES; - tick_nohz_active = 1; - } -#endif + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ @@ -1214,7 +1165,7 @@ void tick_oneshot_notify(void) * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile - * or runtime). + * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { diff --git a/kernel/kernel/time/tick-sched.h b/kernel/kernel/time/tick-sched.h index 28b5da3e1..a4a8d4e9b 100644 --- a/kernel/kernel/time/tick-sched.h +++ b/kernel/kernel/time/tick-sched.h @@ -57,7 +57,7 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; - unsigned long next_jiffies; + u64 next_timer; ktime_t idle_expires; int do_timer_last; }; @@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu); static inline void tick_cancel_sched_timer(int cpu) { } #endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + #endif diff --git a/kernel/kernel/time/time.c b/kernel/kernel/time/time.c index 2c85b7724..86751c68e 100644 --- a/kernel/kernel/time/time.c +++ b/kernel/kernel/time/time.c @@ -41,7 +41,7 @@ #include <asm/uaccess.h> #include <asm/unistd.h> -#include "timeconst.h" +#include <generated/timeconst.h> #include "timekeeping.h" /* @@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { + /* Verify we're witin the +-15 hrs range */ + if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) + return -EINVAL; + sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { @@ -264,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs); unsigned int jiffies_to_usecs(const unsigned long j) { -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + +#if !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; @@ -283,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs); * @t: Timespec * @gran: Granularity in ns. * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. + * Truncate a timespec to a granularity. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec timespec_trunc(struct timespec t, unsigned gran) { - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. - */ - if (gran <= jiffies_to_usecs(1) * 1000) { + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) { /* nothing */ - } else if (gran == 1000000000) { + } else if (gran == NSEC_PER_SEC) { t.tv_nsec = 0; - } else { + } else if (gran > 1 && gran < NSEC_PER_SEC) { t.tv_nsec -= t.tv_nsec % gran; + } else { + WARN(1, "illegal file time granularity: %u", gran); } return t; } @@ -483,9 +485,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec) } EXPORT_SYMBOL(ns_to_timespec64); #endif -/* - * When we convert to jiffies then we interpret incoming values - * the following way: +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * @@ -493,66 +497,36 @@ EXPORT_SYMBOL(ns_to_timespec64); * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the _msecs_to_jiffies helpers are the HZ dependent conversion + * routines found in include/linux/jiffies.h */ -unsigned long msecs_to_jiffies(const unsigned int m) +unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. - * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif + return _msecs_to_jiffies(m); } -EXPORT_SYMBOL(msecs_to_jiffies); +EXPORT_SYMBOL(__msecs_to_jiffies); -unsigned long usecs_to_jiffies(const unsigned int u) +unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif + return _usecs_to_jiffies(u); } -EXPORT_SYMBOL(usecs_to_jiffies); +EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. Note @@ -570,7 +544,7 @@ EXPORT_SYMBOL(usecs_to_jiffies); * value to a scaled second value. */ static unsigned long -__timespec_to_jiffies(unsigned long sec, long nsec) +__timespec64_to_jiffies(u64 sec, long nsec) { nsec = nsec + TICK_NSEC - 1; @@ -578,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec) sec = MAX_SEC_IN_JIFFIES; nsec = 0; } - return (((u64)sec * SEC_CONVERSION) + + return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); + return __timespec64_to_jiffies((u64)sec, nsec); } -EXPORT_SYMBOL(timespec_to_jiffies); +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) +{ + return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); +} +EXPORT_SYMBOL(timespec64_to_jiffies); void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with @@ -604,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) NSEC_PER_SEC, &rem); value->tv_nsec = rem; } -EXPORT_SYMBOL(jiffies_to_timespec); +EXPORT_SYMBOL(jiffies_to_timespec64); /* * We could use a similar algorithm to timespec_to_jiffies (with a diff --git a/kernel/kernel/time/timeconst.bc b/kernel/kernel/time/timeconst.bc index 511bdf2ca..c48688904 100644 --- a/kernel/kernel/time/timeconst.bc +++ b/kernel/kernel/time/timeconst.bc @@ -39,7 +39,7 @@ define fmuls(b,n,d) { } define timeconst(hz) { - print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Automatically generated by kernel/time/timeconst.bc */\n" print "/* Time conversion constants for HZ == ", hz, " */\n" print "\n" @@ -50,7 +50,7 @@ define timeconst(hz) { print "#include <linux/types.h>\n\n" print "#if HZ != ", hz, "\n" - print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n" print "#endif\n\n" if (hz < 2) { @@ -105,4 +105,5 @@ define timeconst(hz) { halt } +hz = read(); timeconst(hz) diff --git a/kernel/kernel/time/timekeeping.c b/kernel/kernel/time/timekeeping.c index 17d4a8933..a1c5c6fc2 100644 --- a/kernel/kernel/time/timekeeping.c +++ b/kernel/kernel/time/timekeeping.c @@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ -/* - * These simple flag variables are managed - * without locks, which is racy, but ok since - * we don't really care about being super - * precise about how many events were seen, - * just that a problem was observed. - */ -static int timekeeping_underflow_seen; -static int timekeeping_overflow_seen; - -/* last_warning is only modified under the timekeeping lock */ -static long timekeeping_last_warning; static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { @@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } - if (timekeeping_underflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->underflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_underflow_seen = 0; + tk->underflow_seen = 0; } - if (timekeeping_overflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->overflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_overflow_seen = 0; + tk->overflow_seen = 0; } } static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { + struct timekeeper *tk = &tk_core.timekeeper; cycle_t now, last, mask, max, delta; unsigned int seq; @@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) * mask-relative negative values. */ if (unlikely((~delta & mask) < (mask >> 3))) { - timekeeping_underflow_seen = 1; + tk->underflow_seen = 1; delta = 0; } /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > max)) { - timekeeping_overflow_seen = 1; + tk->overflow_seen = 1; delta = tkr->clock->max_cycles; } @@ -316,8 +305,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) delta = timekeeping_get_delta(tkr); - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; + nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -330,32 +318,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * - * So we handle this differently than the other timekeeping accessor - * functions which retry when the sequence count has changed. The - * update side does: - * - * smp_wmb(); <- Ensure that the last base[1] update is visible - * tkf->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible - * update(tkf->base[0], tkr); - * smp_wmb(); <- Ensure that the base[0] update is visible - * tkf->seq++; - * smp_wmb(); <- Ensure that the seqcount update is visible - * update(tkf->base[1], tkr); - * - * The reader side does: - * - * do { - * seq = tkf->seq; - * smp_rmb(); - * idx = seq & 0x01; - * now = now(tkf->base[idx]); - * smp_rmb(); - * } while (seq != tkf->seq) - * - * As long as we update base[0] readers are forced off to - * base[1]. Once base[0] is updated readers are redirected to base[0] - * and the base[1] update takes place. + * Employ the latch technique; see @raw_write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a @@ -418,7 +381,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) u64 now; do { - seq = raw_read_seqcount(&tkf->seq); + seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -551,6 +514,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* + * tk_update_leap_state - helper to update the next_leap_ktime + */ +static inline void tk_update_leap_state(struct timekeeper *tk) +{ + tk->next_leap_ktime = ntp_get_next_leap(); + if (tk->next_leap_ktime.tv64 != KTIME_MAX) + /* Convert to monotonic time */ + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); +} + +/* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) @@ -591,17 +565,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) ntp_clear(); } + tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; + /* + * The mirroring of the data to the shadow-timekeeper needs + * to happen last here to ensure we don't over-write the + * timekeeper structure on the next update with stale data + */ if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** @@ -699,6 +681,23 @@ ktime_t ktime_get(void) } EXPORT_SYMBOL_GPL(ktime_get); +u32 ktime_get_resolution_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u32 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return nsecs; +} +EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); + static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, @@ -849,7 +848,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); #ifdef CONFIG_NTP_PPS /** - * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format * @ts_raw: pointer to the timespec to be set to raw monotonic time * @ts_real: pointer to the timespec to be set to the time of day * @@ -857,7 +856,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * same time atomically and stores the resulting timestamps in timespec * format. */ -void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; @@ -868,7 +867,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) do { seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = timespec64_to_timespec(tk->raw_time); + *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; @@ -877,10 +876,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec_add_ns(ts_raw, nsecs_raw); - timespec_add_ns(ts_real, nsecs_real); + timespec64_add_ns(ts_raw, nsecs_raw); + timespec64_add_ns(ts_real, nsecs_real); } -EXPORT_SYMBOL(getnstime_raw_and_real); +EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); #endif /* CONFIG_NTP_PPS */ @@ -911,6 +910,7 @@ int do_settimeofday64(const struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; unsigned long flags; + int ret = 0; if (!timespec64_valid_strict(ts)) return -EINVAL; @@ -924,10 +924,15 @@ int do_settimeofday64(const struct timespec64 *ts) ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { + ret = -EINVAL; + goto out; + } + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); tk_set_xtime(tk, ts); - +out: timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); @@ -936,7 +941,7 @@ int do_settimeofday64(const struct timespec64 *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -965,7 +970,8 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), ts64); - if (!timespec64_valid_strict(&tmp)) { + if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } @@ -1179,28 +1185,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock - Return time of the system start. + * read_boot_clock64 - Return time of the system start. * * Weak dummy function for arches that do not yet support it. * Function to read the exact time the system has been started. - * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -void __weak read_boot_clock(struct timespec *ts) +void __weak read_boot_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } -void __weak read_boot_clock64(struct timespec64 *ts64) -{ - struct timespec ts; - - read_boot_clock(&ts); - *ts64 = timespec_to_timespec64(ts); -} - /* Flag for if timekeeping_resume() has injected sleeptime */ static bool sleeptime_injected; @@ -1252,7 +1250,7 @@ void __init timekeeping_init(void) set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1675,7 +1673,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * - * Helper function that accumulates a the nsecs greater then a second + * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. * @@ -1727,7 +1725,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; - /* If the offset is smaller then a shifted interval, do nothing */ + /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; @@ -1836,8 +1834,9 @@ void update_wall_time(void) * memcpy under the tk_core.seq against one before we start * updating. */ + timekeeping_update(tk, clock_set); memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, clock_set); + /* The memcpy must come last. Do not put anything here! */ write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1881,7 +1880,7 @@ struct timespec __current_kernel_time(void) return timespec64_to_timespec(tk_xtime(tk)); } -struct timespec current_kernel_time(void) +struct timespec64 current_kernel_time64(void) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now; @@ -1893,9 +1892,9 @@ struct timespec current_kernel_time(void) now = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - return timespec64_to_timespec(now); + return now; } -EXPORT_SYMBOL(current_kernel_time); +EXPORT_SYMBOL(current_kernel_time64); struct timespec64 get_monotonic_coarse64(void) { @@ -1926,47 +1925,20 @@ void do_timer(unsigned long ticks) } /** - * ktime_get_update_offsets_tick - hrtimer helper - * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset - * @offs_tai: pointer to storage for monotonic -> clock tai offset - * - * Returns monotonic time at last tick and various offsets - */ -ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - ktime_t base; - u64 nsecs; - - do { - seq = read_seqcount_begin(&tk_core.seq); - - base = tk->tkr_mono.base; - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ktime_add_ns(base, nsecs); -} - -#ifdef CONFIG_HIGH_RES_TIMERS -/** * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * - * Returns current monotonic time and updates the offsets + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. + * * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -1978,15 +1950,23 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); + base = ktime_add_ns(base, nsecs); + + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } + + /* Handle leapsecond insertion adjustments */ + if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64)) + *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; } while (read_seqcount_retry(&tk_core.seq, seq)); - return ktime_add_ns(base, nsecs); + return base; } -#endif /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function @@ -2027,6 +2007,8 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } + tk_update_leap_state(tk); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -2042,7 +2024,7 @@ int do_adjtimex(struct timex *txc) /** * hardpps() - Accessor function to NTP __hardpps function */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { unsigned long flags; diff --git a/kernel/kernel/time/timekeeping.h b/kernel/kernel/time/timekeeping.h index d7a9120a9..763a3e512 100644 --- a/kernel/kernel/time/timekeeping.h +++ b/kernel/kernel/time/timekeeping.h @@ -3,19 +3,16 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); -extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); -extern void timekeeping_clocktai(struct timespec *ts); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); diff --git a/kernel/kernel/time/timer.c b/kernel/kernel/time/timer.c index adb1d82d6..fee8682c2 100644 --- a/kernel/kernel/time/timer.c +++ b/kernel/kernel/time/timer.c @@ -49,6 +49,8 @@ #include <asm/timex.h> #include <asm/io.h> +#include "tick-internal.h" + #define CREATE_TRACE_POINTS #include <trace/events/timer.h> @@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64); #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { - struct list_head vec[TVN_SIZE]; + struct hlist_head vec[TVN_SIZE]; }; struct tvec_root { - struct list_head vec[TVR_SIZE]; + struct hlist_head vec[TVR_SIZE]; }; struct tvec_base { @@ -86,6 +88,8 @@ struct tvec_base { unsigned long active_timers; unsigned long all_timers; int cpu; + bool migration_enabled; + bool nohz_active; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -93,43 +97,60 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; -/* - * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've - * made NULL special, hint: lock_timer_base()) and we cannot get a compile time - * pointer to per-cpu entries because we don't know where we'll map the section, - * even for the boot cpu. - * - * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the - * rest of them. - */ -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; +static DEFINE_PER_CPU(struct tvec_base, tvec_bases); -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; + +void timers_update_migration(bool update_nohz) { - return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); + bool on = sysctl_timer_migration && tick_nohz_active; + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ + if (this_cpu_read(tvec_bases.migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { + per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + if (!update_nohz) + continue; + per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(hrtimer_bases.nohz_active, cpu) = true; + } } -static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(false); + mutex_unlock(&mutex); + return ret; } -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) { - return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&tvec_bases); + return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); } - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +#else +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) { - unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; - - timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); + return this_cpu_ptr(&tvec_bases); } +#endif static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -352,26 +373,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -/* - * If the list is empty, catch up ->timer_jiffies to the current time. - * The caller must hold the tvec_base lock. Returns true if the list - * was empty and therefore ->timer_jiffies was updated. - */ -static bool catchup_timer_jiffies(struct tvec_base *base) -{ - if (!base->all_timers) { - base->timer_jiffies = jiffies; - return true; - } - return false; -} - static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; + struct hlist_head *vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; @@ -404,25 +411,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); + + hlist_add_head(&timer->entry, vec); } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { - (void)catchup_timer_jiffies(base); + /* Advance base->jiffies, if the base is empty */ + if (!base->all_timers++) + base->timer_jiffies = jiffies; + __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer */ - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { if (!base->active_timers++ || time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; } - base->all_timers++; /* * Check whether the other CPU is in dynticks mode and needs @@ -437,8 +444,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * require special care against races with idle_cpu(), lets deal * with that later. */ - if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (base->nohz_active) { + if (!(timer->flags & TIMER_DEFERRABLE) || + tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); + } } #ifdef CONFIG_TIMER_STATS @@ -454,15 +464,19 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) static void timer_stats_account_timer(struct timer_list *timer) { - unsigned int flag = 0; + void *site; - if (likely(!timer->start_site)) + /* + * start_site can be concurrently reset by + * timer_stats_timer_clear_start_info() + */ + site = READ_ONCE(timer->start_site); + if (likely(!site)) return; - if (unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); + timer_stats_update_stats(timer, timer->start_pid, site, + timer->function, timer->start_comm, + timer->flags); } #else @@ -519,8 +533,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); return 0; @@ -566,7 +580,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.next == TIMER_ENTRY_STATIC) { /* * This is not really a fixup. The timer was * statically initialized. We just make sure that it @@ -651,7 +665,7 @@ static inline void debug_activate(struct timer_list *timer, unsigned long expires) { debug_timer_activate(timer); - trace_timer_start(timer, expires); + trace_timer_start(timer, expires, timer->flags); } static inline void debug_deactivate(struct timer_list *timer) @@ -668,10 +682,8 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = raw_cpu_read(tvec_bases); - - timer->entry.next = NULL; - timer->base = (void *)((unsigned long)base | flags); + timer->entry.pprev = NULL; + timer->flags = flags | raw_smp_processor_id(); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -702,24 +714,23 @@ EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { - struct list_head *entry = &timer->entry; + struct hlist_node *entry = &timer->entry; debug_deactivate(timer); - __list_del(entry->prev, entry->next); + __hlist_del(entry); if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; + entry->pprev = NULL; + entry->next = LIST_POISON2; } static inline void detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); - if (!tbase_get_deferrable(timer->base)) + if (!(timer->flags & TIMER_DEFERRABLE)) base->active_timers--; base->all_timers--; - (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -729,13 +740,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } - base->all_timers--; - (void)catchup_timer_jiffies(base); + /* If this was the last timer, advance base->jiffies */ + if (!--base->all_timers) + base->timer_jiffies = jiffies; return 1; } @@ -747,67 +759,68 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * So __run_timers/migrate_timers can safely modify all timers which could * be found on ->tvX lists. * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * When the timer's base is locked and removed from the list, the + * TIMER_MIGRATING flag is set, FIXME */ static struct tvec_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { - struct tvec_base *base; - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { + u32 tf = timer->flags; + struct tvec_base *base; + + if (!(tf & TIMER_MIGRATING)) { + base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) + if (timer->flags == tf) return base; - /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } } - -#ifndef CONFIG_PREEMPT_RT_FULL -static inline struct tvec_base *switch_timer_base(struct timer_list *timer, - struct tvec_base *old, - struct tvec_base *new) -{ - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&old->lock); - spin_lock(&new->lock); - timer_set_base(timer, new); - return new; -} -#else +#ifdef CONFIG_PREEMPT_RT_FULL static inline struct tvec_base *switch_timer_base(struct timer_list *timer, struct tvec_base *old, struct tvec_base *new) { /* - * We cannot do the above because we might be preempted and + * We cannot do the below because we might be preempted and * then the preempter would see NULL and loop forever. */ if (spin_trylock(&new->lock)) { - timer_set_base(timer, new); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | new->cpu); spin_unlock(&old->lock); return new; } return old; } + +#else +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* See the comment in lock_timer_base() */ + timer->flags |= TIMER_MIGRATING; + + spin_unlock(&old->lock); + spin_lock(&new->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | new->cpu); + return new; +} #endif static inline int __mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -820,8 +833,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = get_nohz_timer_target(pinned); - new_base = per_cpu(tvec_bases, cpu); + new_base = get_target_base(base, pinned); if (base != new_base) { /* @@ -890,7 +902,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) if (mask == 0) return expires; - bit = find_last_bit(&mask, BITS_PER_LONG); + bit = __fls(mask); mask = (1UL << bit) - 1; @@ -993,13 +1005,29 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); + struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1012,11 +1040,15 @@ EXPORT_SYMBOL_GPL(add_timer_on); */ static void wait_for_running_timer(struct timer_list *timer) { - struct tvec_base *base = timer->base; + struct tvec_base *base; + u32 tf = timer->flags; - if (base->running_timer == timer) - wait_event(base->wait_for_running_timer, - base->running_timer != timer); + if (tf & TIMER_MIGRATING) + return; + + base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + wait_event(base->wait_for_running_timer, + base->running_timer != timer); } # define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer) @@ -1087,8 +1119,6 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) -static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); - /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1143,7 +1173,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. */ - WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1157,17 +1187,17 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(struct tvec_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; + struct timer_list *timer; + struct hlist_node *tmp; + struct hlist_head tv_list; - list_replace_init(tv->vec + index, &tv_list); + hlist_move_list(tv->vec + index, &tv_list); /* * We are removing _all_ timers from the list, so we * don't have to detach them individually. */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); + hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { /* No accounting, while moving them */ __internal_add_timer(base, timer); } @@ -1232,14 +1262,18 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); - if (catchup_timer_jiffies(base)) { - spin_unlock_irq(&base->lock); - return; - } + while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + struct hlist_head work_list; + struct hlist_head *head = &work_list; + int index; + + if (!base->all_timers) { + base->timer_jiffies = jiffies; + break; + } + + index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -1250,16 +1284,16 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, head); - while (!list_empty(head)) { + hlist_move_list(base->tv1.vec + index, head); + while (!hlist_empty(head)) { void (*fn)(unsigned long); unsigned long data; bool irqsafe; - timer = list_first_entry(head, struct timer_list,entry); + timer = hlist_entry(head->first, struct timer_list, entry); fn = timer->function; data = timer->data; - irqsafe = tbase_get_irqsafe(timer->base); + irqsafe = timer->flags & TIMER_IRQSAFE; timer_stats_account_timer(timer); @@ -1300,8 +1334,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) /* Look for timer events in tv1. */ index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1331,8 +1365,8 @@ cascade: index = slot = timer_jiffies & TVN_MASK; do { - list_for_each_entry(nte, varp->vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + hlist_for_each_entry(nte, varp->vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1363,54 +1397,48 @@ cascade: * Check, if the next hrtimer event is before the next timer wheel * event: */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; + u64 nextevt = hrtimer_get_next_event(); /* - * Expired timer available, let it expire in the next tick + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); + if (expires <= nextevt) + return expires; /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; + if (nextevt <= basem) + return basem; /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } /** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. */ -unsigned long get_next_timer_interrupt(unsigned long now) +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + struct tvec_base *base = this_cpu_ptr(&tvec_bases); + u64 expires = KTIME_MAX; + unsigned long nextevt; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1425,20 +1453,21 @@ unsigned long get_next_timer_interrupt(unsigned long now) * the base lock to check when the next timer is pending and so * we assume the next jiffy. */ - return now + 1; + return basem + TICK_NSEC; #endif spin_lock(&base->lock); if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + nextevt = base->next_timer; + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; } spin_unlock(&base->lock); - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); + return cmp_next_hrtimer_event(basem, expires); } #endif @@ -1455,7 +1484,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(user_tick); -#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL) +#if defined(CONFIG_IRQ_WORK) if (in_irq()) irq_work_tick(); #endif @@ -1467,13 +1496,9 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - - hrtimer_run_pending(); + struct tvec_base *base = this_cpu_ptr(&tvec_bases); -#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) - irq_work_tick(); -#endif + irq_work_tick_soft(); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -1609,15 +1634,16 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) EXPORT_SYMBOL(schedule_timeout_uninterruptible); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { struct timer_list *timer; + int cpu = new_base->cpu; - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); - timer_set_base(timer, new_base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } @@ -1629,8 +1655,8 @@ static void migrate_timers(int cpu) int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_local_var(tvec_bases); + old_base = per_cpu_ptr(&tvec_bases, cpu); + new_base = get_local_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1654,7 +1680,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_local_var(tvec_bases); + put_local_ptr(&tvec_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1680,55 +1706,30 @@ static inline void timer_register_cpu_notifier(void) static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ -static void __init init_timer_cpu(struct tvec_base *base, int cpu) +static void __init init_timer_cpu(int cpu) { - int j; - - BUG_ON(base != tbase_get_base(base)); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); base->cpu = cpu; - per_cpu(tvec_bases, cpu) = base; spin_lock_init(&base->lock); #ifdef CONFIG_PREEMPT_RT_FULL init_waitqueue_head(&base->wait_for_running_timer); #endif - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; } static void __init init_timer_cpus(void) { - struct tvec_base *base; - int local_cpu = smp_processor_id(); int cpu; - for_each_possible_cpu(cpu) { - if (cpu == local_cpu) - base = &boot_tvec_bases; -#ifdef CONFIG_SMP - else - base = per_cpu_ptr(&__tvec_bases, cpu); -#endif - - init_timer_cpu(base, cpu); - } + for_each_possible_cpu(cpu) + init_timer_cpu(cpu); } void __init init_timers(void) { - /* ensure there are enough low bits for flags in timer->base pointer */ - BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - init_timer_cpus(); init_timer_stats(); timer_register_cpu_notifier(); @@ -1764,14 +1765,14 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static int __sched do_usleep_range(unsigned long min, unsigned long max) +static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; unsigned long delta; kmin = ktime_set(0, min * NSEC_PER_USEC); delta = (max - min) * NSEC_PER_USEC; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); + schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } /** @@ -1779,7 +1780,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep */ -void usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range(unsigned long min, unsigned long max) { __set_current_state(TASK_UNINTERRUPTIBLE); do_usleep_range(min, max); diff --git a/kernel/kernel/time/timer_list.c b/kernel/kernel/time/timer_list.c index e878c2e0b..ba7d8b288 100644 --- a/kernel/kernel/time/timer_list.c +++ b/kernel/kernel/time/timer_list.c @@ -29,19 +29,24 @@ struct timer_list_iter { typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): */ -#define SEQ_printf(m, x...) \ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) +__printf(2, 3) +static void SEQ_printf(struct seq_file *m, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + + if (m) + seq_vprintf(m, fmt, args); + else + vprintk(fmt, args); + + va_end(args); +} static void print_name_offset(struct seq_file *m, void *sym) { @@ -64,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); @@ -120,10 +125,10 @@ static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %pK\n", base); - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); @@ -132,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); - print_active_timers(m, base, now); + print_active_timers(m, base, now + ktime_to_ns(base->offset)); } static void print_cpu(struct seq_file *m, int cpu, u64 now) @@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(nr_events); P(nr_retries); P(nr_hangs); - P_ns(max_hang_time); + P(max_hang_time); #endif #undef P #undef P_ns @@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); - P(next_jiffies); + P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); @@ -220,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -228,34 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - if (dev->set_mode) { - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); SEQ_printf(m, "\n"); - } else { - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + } - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } + + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); } SEQ_printf(m, " event_handler: "); @@ -269,11 +274,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) { #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); - SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_mask())[0]); + SEQ_printf(m, "tick_broadcast_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_mask())); #ifdef CONFIG_TICK_ONESHOT - SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); + SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_oneshot_mask())); #endif SEQ_printf(m, "\n"); #endif @@ -282,7 +287,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "Timer List Version: v0.8\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); diff --git a/kernel/kernel/time/timer_stats.c b/kernel/kernel/time/timer_stats.c index 1fb08f213..1adecb4b8 100644 --- a/kernel/kernel/time/timer_stats.c +++ b/kernel/kernel/time/timer_stats.c @@ -68,7 +68,7 @@ struct entry { * Number of timeout events: */ unsigned long count; - unsigned int timer_flag; + u32 flags; /* * We save the command-line string to preserve @@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) * @startf: pointer to the function which did the timer setup * @timerf: pointer to the timer callback function of the timer * @comm: name of the process which set up the timer + * @tflags: The flags field of the timer * * When the timer is already registered, then the event counter is * incremented. Otherwise the timer is registered in a free slot. */ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag) + void *timerf, char *comm, u32 tflags) { /* * It doesn't matter which lock we take: @@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.start_func = startf; input.expire_func = timerf; input.pid = pid; - input.timer_flag = timer_flag; + input.flags = tflags; raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) @@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v) for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->flags & TIMER_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { diff --git a/kernel/kernel/torture.c b/kernel/kernel/torture.c index dd70993c2..44aa462d0 100644 --- a/kernel/kernel/torture.c +++ b/kernel/kernel/torture.c @@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void); */ void torture_shutdown_absorb(const char *title) { - while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_notice("torture thread %s parking due to system shutdown\n", title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); @@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { mutex_lock(&fullstop_mutex); - if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) { + if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) { VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); - ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN; + WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN); } else { pr_warn("Concurrent rmmod and shutdown illegal!\n"); } @@ -523,13 +523,14 @@ static int stutter; */ void stutter_wait(const char *title) { - while (ACCESS_ONCE(stutter_pause_test) || - (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + cond_resched_rcu_qs(); + while (READ_ONCE(stutter_pause_test) || + (torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) - if (ACCESS_ONCE(stutter_pause_test) == 1) + if (READ_ONCE(stutter_pause_test) == 1) schedule_timeout_interruptible(1); else - while (ACCESS_ONCE(stutter_pause_test)) + while (READ_ONCE(stutter_pause_test)) cond_resched(); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); @@ -549,14 +550,14 @@ static int torture_stutter(void *arg) if (!torture_must_stop()) { if (stutter > 1) { schedule_timeout_interruptible(stutter - 1); - ACCESS_ONCE(stutter_pause_test) = 2; + WRITE_ONCE(stutter_pause_test, 2); } schedule_timeout_interruptible(1); - ACCESS_ONCE(stutter_pause_test) = 1; + WRITE_ONCE(stutter_pause_test, 1); } if (!torture_must_stop()) schedule_timeout_interruptible(stutter); - ACCESS_ONCE(stutter_pause_test) = 0; + WRITE_ONCE(stutter_pause_test, 0); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -642,13 +643,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); bool torture_cleanup_begin(void) { mutex_lock(&fullstop_mutex); - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_warn("Concurrent rmmod and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); return true; } - ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; + WRITE_ONCE(fullstop, FULLSTOP_RMMOD); mutex_unlock(&fullstop_mutex); torture_shutdown_cleanup(); torture_shuffle_cleanup(); @@ -681,7 +682,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop); */ bool torture_must_stop_irq(void) { - return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; + return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); diff --git a/kernel/kernel/trace/Kconfig b/kernel/kernel/trace/Kconfig index ab3a277a3..364ccd0eb 100644 --- a/kernel/kernel/trace/Kconfig +++ b/kernel/kernel/trace/Kconfig @@ -538,7 +538,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS bool default y help @@ -739,6 +739,13 @@ config TRACE_ENUM_MAP_FILE If unsure, say N +config TRACING_EVENTS_GPIO + bool "Trace gpio events" + depends on GPIOLIB + default y + help + Enable tracing events for gpio subsystem + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/kernel/trace/blktrace.c b/kernel/kernel/trace/blktrace.c index 483cecfa5..a990824c8 100644 --- a/kernel/kernel/trace/blktrace.c +++ b/kernel/kernel/trace/blktrace.c @@ -103,7 +103,7 @@ record_it: memcpy((void *) t + sizeof(*t), data, len); if (blk_tracer) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); } } @@ -278,7 +278,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); return; } } @@ -437,9 +437,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; struct dentry *dir = NULL; - int ret, i; + int ret; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -451,9 +451,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * some device names have larger paths - convert the slashes * to underscores for this to work as expected */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; + strreplace(buts->name, '/', '_'); bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) @@ -521,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->trace_state = Blktrace_setup; ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); + if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); @@ -780,9 +775,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - if (!error && !bio_flagged(bio, BIO_UPTODATE)) - error = EIO; - __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, what, error, 0, NULL); } @@ -889,8 +881,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu); } } @@ -922,8 +913,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + sizeof(r), &r); } /** @@ -1349,6 +1340,7 @@ static const struct { static enum print_line_t print_one_line(struct trace_iterator *iter, bool classic) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; @@ -1357,7 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = t->action & ((1 << BLK_TC_SHIFT) - 1); - long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; if (t->action == BLK_TN_MESSAGE) { @@ -1419,9 +1411,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; else - trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; } return 0; } @@ -1450,14 +1442,14 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { - if (!register_ftrace_event(&trace_blk_event)) { + if (!register_trace_event(&trace_blk_event)) { pr_warning("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { pr_warning("Warning: could not register the block tracer\n"); - unregister_ftrace_event(&trace_blk_event); + unregister_trace_event(&trace_blk_event); return 1; } @@ -1487,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); @@ -1503,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q, blk_trace_setup_lba(bt, bdev); - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - ret = -EBUSY; + ret = -EBUSY; + if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); diff --git a/kernel/kernel/trace/bpf_trace.c b/kernel/kernel/trace/bpf_trace.c index 2d56ce501..4228fd368 100644 --- a/kernel/kernel/trace/bpf_trace.c +++ b/kernel/kernel/trace/bpf_trace.c @@ -79,27 +79,18 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - /* NMI safe access to clock monotonic */ - return ktime_get_mono_fast_ns(); -} - -static const struct bpf_func_proto bpf_ktime_get_ns_proto = { - .func = bpf_ktime_get_ns, - .gpl_only = true, - .ret_type = RET_INTEGER, -}; - /* * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) { char *fmt = (char *) (long) r1; + bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; + u64 unsafe_addr; + char buf[64]; int i; /* @@ -126,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p') { + } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; i++; if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) return -EINVAL; fmt_cnt++; + if (fmt[i - 1] == 's') { + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + switch (fmt_cnt) { + case 1: + unsafe_addr = r3; + r3 = (long) buf; + break; + case 2: + unsafe_addr = r4; + r4 = (long) buf; + break; + case 3: + unsafe_addr = r5; + r5 = (long) buf; + break; + } + buf[0] = 0; + strncpy_from_unsafe(buf, + (void *) (long) unsafe_addr, + sizeof(buf)); + } continue; } @@ -159,6 +175,95 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_STACK_SIZE, }; +const struct bpf_func_proto *bpf_get_trace_printk_proto(void) +{ + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; +} + +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* make sure event is local and doesn't have pmu::count */ + if (event->oncpu != smp_processor_id() || + event->pmu->count) + return -EINVAL; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. + */ + return perf_event_read_local(event); +} + +static const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + void *data = (void *) (long) r4; + struct perf_sample_data sample_data; + struct perf_event *event; + struct perf_raw_record raw = { + .size = size, + .data = data, + }; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (unlikely(!event)) + return -ENOENT; + + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || + event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) + return -EINVAL; + + if (unlikely(event->oncpu != smp_processor_id())) + return -EOPNOTSUPP; + + perf_sample_data_init(&sample_data, 0, 0); + sample_data.raw = &raw; + perf_event_output(event, &sample_data, regs); + return 0; +} + +static const struct bpf_func_proto bpf_perf_event_output_proto = { + .func = bpf_perf_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -172,15 +277,22 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; - + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_trace_printk: - /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers - */ - trace_printk_init_buffers(); - - return &bpf_trace_printk_proto; + return bpf_get_trace_printk_proto(); + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto; default: return NULL; } diff --git a/kernel/kernel/trace/ftrace.c b/kernel/kernel/trace/ftrace.c index eb11011b5..3f743b147 100644 --- a/kernel/kernel/trace/ftrace.c +++ b/kernel/kernel/trace/ftrace.c @@ -243,6 +243,11 @@ static void ftrace_sync_ipi(void *data) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static void update_function_graph_func(void); + +/* Both enabled by default (can be cleared by function_graph tracer flags */ +static bool fgraph_sleep_time = true; +static bool fgraph_graph_time = true; + #else static inline void update_function_graph_func(void) { } #endif @@ -630,13 +635,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) @@ -912,7 +922,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) calltime = trace->rettime - trace->calltime; - if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + if (!fgraph_graph_time) { int index; index = trace->depth; @@ -3415,27 +3425,35 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static int ftrace_match(char *str, char *regex, int len, int type) +/* Type for quick search ftrace basic regexes (globs) from filter_parse_regex */ +struct ftrace_glob { + char *search; + unsigned len; + int type; +}; + +static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; - switch (type) { + switch (g->type) { case MATCH_FULL: - if (strcmp(str, regex) == 0) + if (strcmp(str, g->search) == 0) matched = 1; break; case MATCH_FRONT_ONLY: - if (strncmp(str, regex, len) == 0) + if (strncmp(str, g->search, g->len) == 0) matched = 1; break; case MATCH_MIDDLE_ONLY: - if (strstr(str, regex)) + if (strstr(str, g->search)) matched = 1; break; case MATCH_END_ONLY: slen = strlen(str); - if (slen >= len && memcmp(str + slen - len, regex, len) == 0) + if (slen >= g->len && + memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; } @@ -3444,13 +3462,13 @@ static int ftrace_match(char *str, char *regex, int len, int type) } static int -enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) { struct ftrace_func_entry *entry; int ret = 0; entry = ftrace_lookup_ip(hash, rec->ip); - if (not) { + if (clear_filter) { /* Do nothing if it doesn't exist */ if (!entry) return 0; @@ -3467,42 +3485,68 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) } static int -ftrace_match_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) +ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, + struct ftrace_glob *mod_g, int exclude_mod) { char str[KSYM_SYMBOL_LEN]; char *modname; kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - if (mod) { - /* module lookup requires matching the module */ - if (!modname || strcmp(modname, mod)) + if (mod_g) { + int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; + + /* blank module name to match all modules */ + if (!mod_g->len) { + /* blank module globbing: modname xor exclude_mod */ + if ((!exclude_mod) != (!modname)) + goto func_match; + return 0; + } + + /* not matching the module */ + if (!modname || !mod_matches) { + if (exclude_mod) + goto func_match; + else + return 0; + } + + if (mod_matches && exclude_mod) return 0; +func_match: /* blank search means to match all funcs in the mod */ - if (!len) + if (!func_g->len) return 1; } - return ftrace_match(str, regex, len, type); + return ftrace_match(str, func_g); } static int -match_records(struct ftrace_hash *hash, char *buff, - int len, char *mod, int not) +match_records(struct ftrace_hash *hash, char *func, int len, char *mod) { - unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; + struct ftrace_glob func_g = { .type = MATCH_FULL }; + struct ftrace_glob mod_g = { .type = MATCH_FULL }; + struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL; + int exclude_mod = 0; int found = 0; int ret; + int clear_filter; - if (len) { - type = filter_parse_regex(buff, len, &search, ¬); - search_len = strlen(search); + if (func) { + func_g.type = filter_parse_regex(func, len, &func_g.search, + &clear_filter); + func_g.len = strlen(func_g.search); + } + + if (mod) { + mod_g.type = filter_parse_regex(mod, strlen(mod), + &mod_g.search, &exclude_mod); + mod_g.len = strlen(mod_g.search); } mutex_lock(&ftrace_lock); @@ -3511,8 +3555,8 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { - ret = enter_record(hash, rec, not); + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { + ret = enter_record(hash, rec, clear_filter); if (ret < 0) { found = ret; goto out_unlock; @@ -3529,26 +3573,9 @@ match_records(struct ftrace_hash *hash, char *buff, static int ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) { - return match_records(hash, buff, len, NULL, 0); + return match_records(hash, buff, len, NULL); } -static int -ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) -{ - int not = 0; - - /* blank or '*' mean the same */ - if (strcmp(buff, "*") == 0) - buff[0] = 0; - - /* handle the case of 'dont filter this module' */ - if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { - buff[0] = 0; - not = 1; - } - - return match_records(hash, buff, strlen(buff), mod, not); -} /* * We register the module command as a template to show others how @@ -3557,10 +3584,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) static int ftrace_mod_callback(struct ftrace_hash *hash, - char *func, char *cmd, char *param, int enable) + char *func, char *cmd, char *module, int enable) { - char *mod; - int ret = -EINVAL; + int ret; /* * cmd == 'mod' because we only registered this func @@ -3569,21 +3595,11 @@ ftrace_mod_callback(struct ftrace_hash *hash, * you can tell which command was used by the cmd * parameter. */ - - /* we must have a module name */ - if (!param) - return ret; - - mod = strsep(¶m, ":"); - if (!strlen(mod)) - return ret; - - ret = ftrace_match_module_records(hash, func, mod); + ret = match_records(hash, func, strlen(func), module); if (!ret) - ret = -EINVAL; + return -EINVAL; if (ret < 0) return ret; - return 0; } @@ -3694,19 +3710,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, { struct ftrace_ops_hash old_hash_ops; struct ftrace_func_probe *entry; + struct ftrace_glob func_g; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; - int type, len, not; + int not; unsigned long key; int count = 0; - char *search; int ret; - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -3733,7 +3750,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, NULL, search, len, type)) + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -3806,24 +3823,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; + struct ftrace_glob func_g; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; - int type = MATCH_FULL; - int i, len = 0; - char *search; - int ret; + int i, ret; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) - glob = NULL; + func_g.search = NULL; else if (glob) { int not; - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); + func_g.search = glob; /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -3852,10 +3869,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; /* do this last, since it is the most expensive */ - if (glob) { + if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); - if (!ftrace_match(str, glob, len, type)) + if (!ftrace_match(str, &func_g)) continue; } @@ -3884,7 +3901,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ftrace_free_entry(entry); } mutex_unlock(&ftrace_lock); - + out_unlock: mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); @@ -4600,21 +4617,21 @@ ftrace_graph_release(struct inode *inode, struct file *file) static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { + struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page *pg; - int search_len; int fail = 1; - int type, not; - char *search; + int not; bool exists; int i; /* decode regex */ - type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); + func_g.type = filter_parse_regex(buffer, strlen(buffer), + &func_g.search, ¬); if (!not && *idx >= size) return -EBUSY; - search_len = strlen(search); + func_g.len = strlen(func_g.search); mutex_lock(&ftrace_lock); @@ -4625,7 +4642,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, NULL, search, search_len, type)) { + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { @@ -4778,17 +4795,6 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } -static void ftrace_swap_ips(void *a, void *b, int size) -{ - unsigned long *ipa = a; - unsigned long *ipb = b; - unsigned long t; - - t = *ipa; - *ipa = *ipb; - *ipb = t; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -4808,7 +4814,7 @@ static int ftrace_process_locs(struct module *mod, return 0; sort(start, count, sizeof(*start), - ftrace_cmp_ips, ftrace_swap_ips); + ftrace_cmp_ips, NULL); start_pg = ftrace_allocate_pages(count); if (!start_pg) @@ -5634,6 +5640,16 @@ static struct ftrace_ops graph_ops = { ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; +void ftrace_graph_sleep_time_control(bool enable) +{ + fgraph_sleep_time = enable; +} + +void ftrace_graph_graph_time_control(bool enable) +{ + fgraph_graph_time = enable; +} + int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; @@ -5692,7 +5708,7 @@ free: } static void -ftrace_graph_probe_sched_switch(void *ignore, +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; @@ -5702,7 +5718,7 @@ ftrace_graph_probe_sched_switch(void *ignore, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. */ - if (trace_flags & TRACE_ITER_SLEEP_TIME) + if (fgraph_sleep_time) return; timestamp = trace_clock_local(); diff --git a/kernel/kernel/trace/latency_hist.c b/kernel/kernel/trace/latency_hist.c index 66a69eb53..7f6ee70de 100644 --- a/kernel/kernel/trace/latency_hist.c +++ b/kernel/kernel/trace/latency_hist.c @@ -115,9 +115,9 @@ static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio); static char *wakeup_latency_hist_dir = "wakeup"; static char *wakeup_latency_hist_dir_sharedprio = "sharedprio"; static notrace void probe_wakeup_latency_hist_start(void *v, - struct task_struct *p, int success); + struct task_struct *p); static notrace void probe_wakeup_latency_hist_stop(void *v, - struct task_struct *prev, struct task_struct *next); + bool preempt, struct task_struct *prev, struct task_struct *next); static notrace void probe_sched_migrate_task(void *, struct task_struct *task, int cpu); static struct enable_data wakeup_latency_enabled_data = { @@ -869,7 +869,7 @@ static notrace void probe_sched_migrate_task(void *v, struct task_struct *task, } static notrace void probe_wakeup_latency_hist_start(void *v, - struct task_struct *p, int success) + struct task_struct *p) { unsigned long flags; struct task_struct *curr = current; @@ -907,7 +907,7 @@ out: } static notrace void probe_wakeup_latency_hist_stop(void *v, - struct task_struct *prev, struct task_struct *next) + bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long flags; int cpu = task_cpu(next); diff --git a/kernel/kernel/trace/ring_buffer.c b/kernel/kernel/trace/ring_buffer.c index 0315d4317..9c6045a27 100644 --- a/kernel/kernel/trace/ring_buffer.c +++ b/kernel/kernel/trace/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/trace_seq.h> @@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) * */ -/* - * A fast way to enable or disable all ring buffers is to - * call tracing_on or tracing_off. Turning off the ring buffers - * prevents all ring buffers from being recorded to. - * Turning this switch on, makes it OK to write to the - * ring buffer, if the ring buffer is enabled itself. - * - * There's three layers that must be on in order to write - * to the ring buffer. - * - * 1) This global flag must be set. - * 2) The ring buffer must be enabled for recording. - * 3) The per cpu buffer must be enabled for recording. - * - * In case of an anomaly, this global flag has a bit set that - * will permantly disable all ring buffers. - */ - -/* - * Global flag to disable all recording to ring buffers - * This has two bits: ON, DISABLED - * - * ON DISABLED - * ---- ---------- - * 0 0 : ring buffers are off - * 1 0 : ring buffers are on - * X 1 : ring buffers are permanently disabled - */ - -enum { - RB_BUFFERS_ON_BIT = 0, - RB_BUFFERS_DISABLED_BIT = 1, -}; - -enum { - RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, - RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, -}; - -static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; - /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) -/** - * tracing_off_permanent - permanently disable ring buffers - * - * This function, once called, will disable all ring buffers - * permanently. - */ -void tracing_off_permanent(void) -{ - set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); -} - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -452,6 +400,34 @@ struct rb_irq_work { }; /* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + +/* + * Used for which event context the event is in. + * NMI = 0 + * IRQ = 1 + * SOFTIRQ = 2 + * NORMAL = 3 + * + * See trace_recursive_lock() comment below for more details. + */ +enum { + RB_CTX_NMI, + RB_CTX_IRQ, + RB_CTX_SOFTIRQ, + RB_CTX_NORMAL, + RB_CTX_MAX +}; + +/* * head_page == tail_page && head == tail then buffer is empty. */ struct ring_buffer_per_cpu { @@ -462,6 +438,7 @@ struct ring_buffer_per_cpu { arch_spinlock_t lock; struct lock_class_key lock_key; unsigned int nr_pages; + unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ @@ -852,7 +829,7 @@ rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, * writer is ever on it, the previous pointer never points * back to the reader page. */ -static int rb_is_reader_page(struct buffer_page *page) +static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; @@ -1910,79 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - -static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; - cpu_buffer->reader_page->read = 0; -} - static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -2002,64 +1906,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. - */ -static void -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) -{ - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2218,29 +2064,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length = 1; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2310,13 +2140,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2402,74 +2233,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - struct buffer_page *tail_page; - struct ring_buffer_event *event; - unsigned long tail, write; + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + return skip_time_extend(event); +} - /* set write to only the index of the write */ - write &= RB_WRITE_MASK; - tail = write - length; +static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - if (!tail) + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; delta = 0; + } - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} - /* We reserved something on the buffer */ +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + /* zero length can cause confusions */ + if (!length) + length++; - local_inc(&tail_page->entries); + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* - * If this is the first commit on the page, then update - * its timestamp. + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). */ - if (!tail) - tail_page->page->time_stamp = ts; + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; - /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + return length; +} - return event; +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; } +#endif static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, @@ -2517,6 +2394,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2549,95 +2479,96 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } } -static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, - struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) +static inline void rb_event_discard(struct ring_buffer_event *event) { - struct ring_buffer_event *event; - u64 ts, delta; - int nr_loops = 0; - int add_timestamp; - u64 diff; + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); - rb_start_commit(cpu_buffer); + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - /* - * Due to the ability to swap a cpu buffer from a buffer - * it is possible it was swapped before we committed. - * (committing stops a swap). We check for it here and - * if it happened, we have to fail the write. - */ - barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { - local_dec(&cpu_buffer->committing); - local_dec(&cpu_buffer->commits); - return NULL; - } -#endif +static inline bool +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; - length = rb_calculate_event_length(length); - again: - add_timestamp = 0; - delta = 0; + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; /* - * We allow for interrupts to reenter here and do a trace. - * If one does, it will cause this original code to loop - * back here. Even with heavy interrupts happening, this - * should only happen a few times in a row. If this happens - * 1000 times in a row, there must be either an interrupt - * storm or we have something buggy. - * Bail! + * The event first in the commit queue updates the + * time stamp. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - goto out_fail; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} - /* make sure this diff is calculated here */ - barrier(); +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; - /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; - } + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); - if (unlikely(PTR_ERR(event) == -EAGAIN)) - goto again; - - if (!event) - goto out_fail; + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } - return event; + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - out_fail: - rb_end_commit(cpu_buffer); - return NULL; + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } -#ifdef CONFIG_TRACING - /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified @@ -2675,210 +2606,270 @@ rb_reserve_next_event(struct ring_buffer *buffer, * just so happens that it is the same bit corresponding to * the current context. */ -static DEFINE_PER_CPU(unsigned int, current_context); -static __always_inline int trace_recursive_lock(void) +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned int val = __this_cpu_read(current_context); + unsigned int val = cpu_buffer->current_context; int bit; if (in_interrupt()) { if (in_nmi()) - bit = 0; + bit = RB_CTX_NMI; else if (in_irq()) - bit = 1; + bit = RB_CTX_IRQ; else - bit = 2; + bit = RB_CTX_SOFTIRQ; } else - bit = 3; + bit = RB_CTX_NORMAL; if (unlikely(val & (1 << bit))) return 1; val |= (1 << bit); - __this_cpu_write(current_context, val); + cpu_buffer->current_context = val; return 0; } -static __always_inline void trace_recursive_unlock(void) +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } -#else - -#define trace_recursive_lock() (0) -#define trace_recursive_unlock() do { } while (0) - -#endif - /** - * ring_buffer_lock_reserve - reserve a part of the buffer - * @buffer: the ring buffer to reserve from - * @length: the length of the data to reserve (excluding event header) - * - * Returns a reseverd event on the ring buffer to copy directly to. - * The user of this interface will need to get the body to write into - * and can use the ring_buffer_event_data() interface. + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. * - * The length is the length of the data needed, not the event length - * which also includes the event header. + * This commits the data to the ring buffer, and releases any locks held. * - * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. - * If NULL is returned, then nothing has been allocated or locked. + * Must be paired with ring_buffer_lock_reserve. */ -struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - int cpu; + int cpu = raw_smp_processor_id(); - if (ring_buffer_flags != RB_BUFFERS_ON) - return NULL; + cpu_buffer = buffer->buffers[cpu]; - /* If we are tracing schedule, we don't want to recurse */ - preempt_disable_notrace(); + rb_commit(cpu_buffer, event); - if (atomic_read(&buffer->record_disabled)) - goto out_nocheck; + rb_wakeups(buffer, cpu_buffer); - if (trace_recursive_lock()) - goto out_nocheck; + trace_recursive_unlock(cpu_buffer); - cpu = raw_smp_processor_id(); + preempt_enable_notrace(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - cpu_buffer = buffer->buffers[cpu]; +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} - if (atomic_read(&cpu_buffer->record_disabled)) - goto out; +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + struct ring_buffer_event *event; + struct buffer_page *tail_page; + unsigned long tail, write; - if (length > BUF_MAX_DATA_SIZE) - goto out; + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; - event = rb_reserve_next_event(buffer, cpu_buffer, length); - if (!event) - goto out; + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); - return event; + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + tail = write - info->length; - out: - trace_recursive_unlock(); + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + info->delta = 0; - out_nocheck: - preempt_enable_notrace(); - return NULL; -} -EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); /* - * The event first in the commit queue updates the - * time stamp. + * If this is the first commit on the page, then update + * its timestamp. */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} + if (!tail) + tail_page->page->time_stamp = info->ts; -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); + /* account for these added bytes */ + local_add(info->length, &cpu_buffer->entries_bytes); + + return event; } -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) { - bool pagebusy; + struct ring_buffer_event *event; + struct rb_event_info info; + int nr_loops = 0; + u64 diff; - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); + rb_start_commit(cpu_buffer); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; } +#endif - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + info.length = rb_calculate_event_length(length); + again: + info.add_timestamp = 0; + info.delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); } - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + event = __rb_reserve_next(cpu_buffer, &info); - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + if (unlikely(PTR_ERR(event) == -EAGAIN)) { + if (info.add_timestamp) + info.length -= RB_LEN_TIME_EXTEND; + goto again; } + + if (!event) + goto out_fail; + + return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; } /** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) * - * This commits the data to the ring buffer, and releases any locks held. + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. * - * Must be paired with ring_buffer_lock_reserve. + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); + struct ring_buffer_event *event; + int cpu; - cpu_buffer = buffer->buffers[cpu]; + /* If we are tracing schedule, we don't want to recurse */ + preempt_disable_notrace(); - rb_commit(cpu_buffer, event); + if (unlikely(atomic_read(&buffer->record_disabled))) + goto out; - rb_wakeups(buffer, cpu_buffer); + cpu = raw_smp_processor_id(); - trace_recursive_unlock(); + if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) + goto out; - preempt_enable_notrace(); + cpu_buffer = buffer->buffers[cpu]; - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + if (unlikely(atomic_read(&cpu_buffer->record_disabled))) + goto out; -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); + if (unlikely(length > BUF_MAX_DATA_SIZE)) + goto out; - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; + if (unlikely(trace_recursive_lock(cpu_buffer))) + goto out; + + event = rb_reserve_next_event(buffer, cpu_buffer, length); + if (!event) + goto out_unlock; + + return event; + + out_unlock: + trace_recursive_unlock(cpu_buffer); + out: + preempt_enable_notrace(); + return NULL; } +EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); /* * Decrement the entries to the page that an event is on. @@ -2970,7 +2961,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, out: rb_end_commit(cpu_buffer); - trace_recursive_unlock(); + trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); @@ -3000,9 +2991,6 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu; - if (ring_buffer_flags != RB_BUFFERS_ON) - return -EBUSY; - preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) @@ -3021,9 +3009,12 @@ int ring_buffer_write(struct ring_buffer *buffer, if (length > BUF_MAX_DATA_SIZE) goto out; + if (unlikely(trace_recursive_lock(cpu_buffer))) + goto out; + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) - goto out; + goto out_unlock; body = rb_event_data(event); @@ -3034,6 +3025,10 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_wakeups(buffer, cpu_buffer); ret = 0; + + out_unlock: + trace_recursive_unlock(cpu_buffer); + out: preempt_enable_notrace(); @@ -3041,7 +3036,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = rb_set_head_page(cpu_buffer); @@ -3049,7 +3044,7 @@ static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) /* In case of error, head will be NULL */ if (unlikely(!head)) - return 1; + return true; return reader->read == rb_page_commit(reader) && (commit == reader || @@ -3628,7 +3623,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; - rb_reset_reader_page(cpu_buffer); + cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; @@ -3638,6 +3633,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: + /* Update the read_stamp on the first event */ + if (reader && reader->read == 0) + cpu_buffer->read_stamp = reader->page->time_stamp; + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); @@ -3860,19 +3859,36 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); -static inline int rb_ok_to_lock(void) +static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) { + if (likely(!in_nmi())) { + raw_spin_lock(&cpu_buffer->reader_lock); + return true; + } + /* * If an NMI die dumps out the content of the ring buffer - * do not grab locks. We also permanently disable the ring - * buffer too. A one time deal is all you get from reading - * the ring buffer from an NMI. + * trylock must be used to prevent a deadlock if the NMI + * preempted a task that holds the ring buffer locks. If + * we get the lock then all is fine, if not, then continue + * to do the read, but this can corrupt the ring buffer, + * so it must be permanently disabled from future writes. + * Reading from NMI is a oneshot deal. */ - if (likely(!in_nmi())) - return 1; + if (raw_spin_trylock(&cpu_buffer->reader_lock)) + return true; - tracing_off_permanent(); - return 0; + /* Continue without locking, but disable the ring buffer */ + atomic_inc(&cpu_buffer->record_disabled); + return false; +} + +static inline void +rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) +{ + if (likely(locked)) + raw_spin_unlock(&cpu_buffer->reader_lock); + return; } /** @@ -3892,21 +3908,18 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; - int dolock; + bool dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - dolock = rb_ok_to_lock(); again: local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3959,9 +3972,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; - int dolock; - - dolock = rb_ok_to_lock(); + bool dolock; again: /* might be called in atomic */ @@ -3972,8 +3983,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3981,8 +3991,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, rb_advance_reader(cpu_buffer); } - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); out: @@ -4259,32 +4268,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); * rind_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ -int ring_buffer_empty(struct ring_buffer *buffer) +bool ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int dolock; + bool dolock; int cpu; int ret; - dolock = rb_ok_to_lock(); - /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (!ret) - return 0; + return false; } - return 1; + return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -4293,25 +4298,21 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); * @buffer: The ring buffer * @cpu: The CPU buffer to test */ -int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int dolock; + bool dolock; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; - - dolock = rb_ok_to_lock(); + return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); + dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); + rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); return ret; @@ -4349,9 +4350,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, ret = -EAGAIN; - if (ring_buffer_flags != RB_BUFFERS_ON) - goto out; - if (atomic_read(&buffer_a->record_disabled)) goto out; diff --git a/kernel/kernel/trace/ring_buffer_benchmark.c b/kernel/kernel/trace/ring_buffer_benchmark.c index 1b28df2d9..6df9a83e2 100644 --- a/kernel/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/kernel/trace/ring_buffer_benchmark.c @@ -24,19 +24,19 @@ struct rb_page { static int wakeup_interval = 100; static int reader_finish; -static struct completion read_start; -static struct completion read_done; +static DECLARE_COMPLETION(read_start); +static DECLARE_COMPLETION(read_done); static struct ring_buffer *buffer; static struct task_struct *producer; static struct task_struct *consumer; static unsigned long read; -static int disable_reader; +static unsigned int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run producer"); -static int write_iteration = 50; +static unsigned int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); @@ -46,26 +46,26 @@ static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; -module_param(producer_nice, uint, 0644); +module_param(producer_nice, int, 0644); MODULE_PARM_DESC(producer_nice, "nice prio for producer"); -module_param(consumer_nice, uint, 0644); +module_param(consumer_nice, int, 0644); MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); -module_param(producer_fifo, uint, 0644); +module_param(producer_fifo, int, 0644); MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); -module_param(consumer_fifo, uint, 0644); +module_param(consumer_fifo, int, 0644); MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); static int read_events; -static int kill_test; +static int test_error; -#define KILL_TEST() \ +#define TEST_ERROR() \ do { \ - if (!kill_test) { \ - kill_test = 1; \ + if (!test_error) { \ + test_error = 1; \ WARN_ON(1); \ } \ } while (0) @@ -75,6 +75,11 @@ enum event_status { EVENT_DROPPED, }; +static bool break_test(void) +{ + return test_error || kthread_should_stop(); +} + static enum event_status read_event(int cpu) { struct ring_buffer_event *event; @@ -87,7 +92,7 @@ static enum event_status read_event(int cpu) entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); return EVENT_DROPPED; } @@ -115,10 +120,10 @@ static enum event_status read_page(int cpu) rpage = bpage; /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; - for (i = 0; i < commit && !kill_test; i += inc) { + for (i = 0; i < commit && !test_error ; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { - KILL_TEST(); + TEST_ERROR(); break; } @@ -128,7 +133,7 @@ static enum event_status read_page(int cpu) case RINGBUF_TYPE_PADDING: /* failed writes may be discarded events */ if (!event->time_delta) - KILL_TEST(); + TEST_ERROR(); inc = event->array[0] + 4; break; case RINGBUF_TYPE_TIME_EXTEND: @@ -137,12 +142,12 @@ static enum event_status read_page(int cpu) case 0: entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); break; } read++; if (!event->array[0]) { - KILL_TEST(); + TEST_ERROR(); break; } inc = event->array[0] + 4; @@ -150,17 +155,17 @@ static enum event_status read_page(int cpu) default: entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); break; } read++; inc = ((event->type_len + 1) * 4); } - if (kill_test) + if (test_error) break; if (inc <= 0) { - KILL_TEST(); + TEST_ERROR(); break; } } @@ -178,10 +183,14 @@ static void ring_buffer_consumer(void) read_events ^= 1; read = 0; - while (!reader_finish && !kill_test) { - int found; + /* + * Continue running until the producer specifically asks to stop + * and is ready for the completion. + */ + while (!READ_ONCE(reader_finish)) { + int found = 1; - do { + while (found && !test_error) { int cpu; found = 0; @@ -193,19 +202,25 @@ static void ring_buffer_consumer(void) else stat = read_page(cpu); - if (kill_test) + if (test_error) break; + if (stat == EVENT_FOUND) found = 1; + } - } while (found && !kill_test); + } + /* Wait till the producer wakes us up when there is more data + * available or when the producer wants us to finish reading. + */ set_current_state(TASK_INTERRUPTIBLE); if (reader_finish) break; schedule(); } + __set_current_state(TASK_RUNNING); reader_finish = 0; complete(&read_done); } @@ -263,8 +278,7 @@ static void ring_buffer_producer(void) if (cnt % wakeup_interval) cond_resched(); #endif - - } while (ktime_before(end_time, timeout) && !kill_test); + } while (ktime_before(end_time, timeout) && !break_test()); trace_printk("End ring buffer hammer\n"); if (consumer) { @@ -274,8 +288,6 @@ static void ring_buffer_producer(void) /* the completions must be visible before the finish var */ smp_wmb(); reader_finish = 1; - /* finish var visible before waking up the consumer */ - smp_wmb(); wake_up_process(consumer); wait_for_completion(&read_done); } @@ -285,7 +297,7 @@ static void ring_buffer_producer(void) entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); - if (kill_test) + if (test_error) trace_printk("ERROR!\n"); if (!disable_reader) { @@ -366,20 +378,19 @@ static void wait_to_die(void) static int ring_buffer_consumer_thread(void *arg) { - while (!kthread_should_stop() && !kill_test) { + while (!break_test()) { complete(&read_start); ring_buffer_consumer(); set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop() || kill_test) + if (break_test()) break; - schedule(); } __set_current_state(TASK_RUNNING); - if (kill_test) + if (!kthread_should_stop()) wait_to_die(); return 0; @@ -387,25 +398,28 @@ static int ring_buffer_consumer_thread(void *arg) static int ring_buffer_producer_thread(void *arg) { - init_completion(&read_start); - - while (!kthread_should_stop() && !kill_test) { + while (!break_test()) { ring_buffer_reset(buffer); if (consumer) { - smp_wmb(); wake_up_process(consumer); wait_for_completion(&read_start); } ring_buffer_producer(); + if (break_test()) + goto out_kill; trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); + if (break_test()) + goto out_kill; schedule_timeout(HZ * SLEEP_TIME); } - if (kill_test) +out_kill: + __set_current_state(TASK_RUNNING); + if (!kthread_should_stop()) wait_to_die(); return 0; diff --git a/kernel/kernel/trace/trace.c b/kernel/kernel/trace/trace.c index 5088d1ce2..f1ee84b00 100644 --- a/kernel/kernel/trace/trace.c +++ b/kernel/kernel/trace/trace.c @@ -100,8 +100,6 @@ static DEFINE_PER_CPU(bool, trace_cmdline_save); */ static int tracing_disabled = 1; -DEFINE_PER_CPU(int, ftrace_cpu_disabled); - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -214,12 +212,10 @@ __setup("alloc_snapshot", boot_alloc_snapshot); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; -static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - trace_boot_options = trace_boot_options_buf; return 0; } __setup("trace_options=", set_trace_boot_options); @@ -250,6 +246,19 @@ unsigned long long ns2usecs(cycle_t nsec) return nsec; } +/* trace_flags holds trace_options default values */ +#define TRACE_DEFAULT_FLAGS \ + (FUNCTION_DEFAULT_FLAGS | \ + TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ + TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS) + +/* trace_options that are only supported by global_trace */ +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ + TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) + + /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -262,7 +271,9 @@ unsigned long long ns2usecs(cycle_t nsec) * pages for the buffer for that CPU. Each CPU has the same number * of pages allocated for its buffer. */ -static struct trace_array global_trace; +static struct trace_array global_trace = { + .trace_flags = TRACE_DEFAULT_FLAGS, +}; LIST_HEAD(ftrace_trace_arrays); @@ -297,11 +308,11 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_check_discard(struct ftrace_event_file *file, void *rec, +int filter_check_discard(struct trace_event_file *file, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; @@ -311,7 +322,7 @@ int filter_check_discard(struct ftrace_event_file *file, void *rec, } EXPORT_SYMBOL_GPL(filter_check_discard); -int call_filter_check_discard(struct ftrace_event_call *call, void *rec, +int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { @@ -468,11 +479,29 @@ static inline void trace_access_lock_init(void) #endif -/* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs); +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs); + +#else +static inline void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ +} +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ +} + +#endif static void tracer_tracing_on(struct trace_array *tr) { @@ -518,7 +547,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) int alloc; int pc; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); @@ -548,7 +577,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc); + ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); return size; } @@ -568,7 +597,7 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int pc; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); @@ -588,7 +617,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc); + ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); return 1; } @@ -834,34 +863,18 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } +/* + * TRACE_FLAGS is defined as a tuple matching bit masks with strings. + * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list + * of strings in the order that the enums were defined. + */ +#undef C +#define C(a, b) b + /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { - "print-parent", - "sym-offset", - "sym-addr", - "verbose", - "raw", - "hex", - "bin", - "block", - "stacktrace", - "trace_printk", - "ftrace_preempt", - "branch", - "annotate", - "userstacktrace", - "sym-userobj", - "printk-msg-only", - "context-info", - "latency-format", - "sleep-time", - "graph-time", - "record-cmd", - "overwrite", - "disable_on_free", - "irq-info", - "markers", - "function-trace", + TRACE_FLAGS NULL }; @@ -876,6 +889,7 @@ static struct { { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, + { ktime_get_raw_fast_ns, "mono_raw", 1 }, ARCH_TRACE_CLOCKS }; @@ -1203,13 +1217,17 @@ static inline int run_tracer_selftest(struct tracer *type) } #endif /* CONFIG_FTRACE_STARTUP_TEST */ +static void add_tracer_options(struct trace_array *tr, struct tracer *t); + +static void __init apply_trace_boot_options(void); + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer * * Register a new plugin tracer. */ -int register_tracer(struct tracer *type) +int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; @@ -1252,6 +1270,7 @@ int register_tracer(struct tracer *type) type->next = trace_types; trace_types = type; + add_tracer_options(&global_trace, type); out: tracing_selftest_running = false; @@ -1267,6 +1286,9 @@ int register_tracer(struct tracer *type) /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; + + apply_trace_boot_options(); + /* disable other selftests, since this will break it. */ tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1674,36 +1696,29 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve ring_buffer_unlock_commit(buffer, event); } -static inline void -__trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } - -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, - struct ftrace_event_file *ftrace_file, + struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc) { struct ring_buffer_event *entry; - *current_rb = ftrace_file->tr->trace_buffer.buffer; + *current_rb = trace_file->tr->trace_buffer.buffer; entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* @@ -1712,7 +1727,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, * to store the trace event for the tigger to use. It's recusive * safe and will not be recorded anywhere. */ - if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); @@ -1732,22 +1747,15 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); - -void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -1764,15 +1772,11 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_function; + struct trace_event_call *call = &event_function; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; - /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) @@ -1799,7 +1803,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { - struct ftrace_event_call *call = &event_kernel_stack; + struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -1876,24 +1880,17 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs) +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) { - if (!(trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(buffer, flags, skip, pc, regs); } -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) -{ - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); -} - void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { @@ -1927,12 +1924,12 @@ static DEFINE_PER_CPU(int, user_stack_count); void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_user_stack; + struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* @@ -2133,7 +2130,7 @@ static void trace_printk_start_stop_comm(int enabled) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct ftrace_event_call *call = &event_bprint; + struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_array *tr = &global_trace; @@ -2176,7 +2173,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); } out: @@ -2191,7 +2188,7 @@ static int __trace_array_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { - struct ftrace_event_call *call = &event_print; + struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size, pc; struct print_entry *entry; @@ -2228,7 +2225,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } out: preempt_enable_notrace(); @@ -2249,7 +2246,7 @@ int trace_array_printk(struct trace_array *tr, int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -2264,7 +2261,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -2611,7 +2608,7 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); struct trace_buffer *buf = iter->trace_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; @@ -2673,20 +2670,22 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpumask_test_cpu(iter->cpu, iter->started)) + if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; - cpumask_set_cpu(iter->cpu, iter->started); + if (iter->started) + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) @@ -2696,8 +2695,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter) static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; @@ -2707,7 +2707,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -2727,13 +2727,14 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -2751,6 +2752,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; @@ -2758,7 +2760,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -2780,13 +2782,14 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -2835,6 +2838,8 @@ int trace_empty(struct trace_iterator *iter) /* Called with trace_event_read_lock() held. */ enum print_line_t print_trace_line(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; if (iter->lost_events) { @@ -2880,6 +2885,7 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) @@ -2888,13 +2894,15 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) return; @@ -3044,7 +3052,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; @@ -3239,7 +3247,7 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -3486,7 +3494,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { - if (trace_flags & (1 << i)) + if (tr->trace_flags & (1 << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); @@ -3551,7 +3559,7 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ - if (!!(trace_flags & mask) == !!enabled) + if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ @@ -3560,9 +3568,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return -EINVAL; if (enabled) - trace_flags |= mask; + tr->trace_flags |= mask; else - trace_flags &= ~mask; + tr->trace_flags &= ~mask; if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); @@ -3574,8 +3582,10 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) #endif } - if (mask == TRACE_ITER_PRINTK) + if (mask == TRACE_ITER_PRINTK) { trace_printk_start_stop_comm(enabled); + trace_printk_control(enabled); + } return 0; } @@ -3586,6 +3596,7 @@ static int trace_set_options(struct trace_array *tr, char *option) int neg = 0; int ret = -ENODEV; int i; + size_t orig_len = strlen(option); cmp = strstrip(option); @@ -3609,9 +3620,36 @@ static int trace_set_options(struct trace_array *tr, char *option) mutex_unlock(&trace_types_lock); + /* + * If the first trailing whitespace is replaced with '\0' by strstrip, + * turn it back into a space. + */ + if (orig_len > strlen(option)) + option[strlen(option)] = ' '; + return ret; } +static void __init apply_trace_boot_options(void) +{ + char *buf = trace_boot_options_buf; + char *option; + + while (true) { + option = strsep(&buf, ","); + + if (!option) + break; + + if (*option) + trace_set_options(&global_trace, option); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4306,11 +4344,8 @@ int tracing_update_buffers(void) struct trace_option_dentry; -static struct trace_option_dentry * -create_trace_option_files(struct trace_array *tr, struct tracer *tracer); - static void -destroy_trace_option_files(struct trace_option_dentry *topts); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); /* * Used to clear out the tracer before deletion of an instance. @@ -4329,20 +4364,13 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static void update_tracer_options(struct trace_array *tr, struct tracer *t) +static void add_tracer_options(struct trace_array *tr, struct tracer *t) { - static struct trace_option_dentry *topts; - /* Only enable if the directory has been created already. */ if (!tr->dir) return; - /* Currently, only the top instance has options */ - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - return; - - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); + create_trace_option_files(tr, t); } static int tracing_set_tracer(struct trace_array *tr, const char *buf) @@ -4411,7 +4439,6 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -4531,6 +4558,8 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE + static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4545,6 +4574,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); } +#endif + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -4578,7 +4609,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -4635,11 +4666,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) static unsigned int trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { + struct trace_array *tr = iter->tr; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return POLLIN | POLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ @@ -5056,7 +5089,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? */ - if (trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -5089,7 +5122,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; - if (!(trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -5444,12 +5477,14 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, }; +#endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, @@ -6141,13 +6176,6 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) #include "trace_selftest.c" #endif -struct trace_option_dentry { - struct tracer_opt *opt; - struct tracer_flags *flags; - struct trace_array *tr; - struct dentry *entry; -}; - static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -6200,14 +6228,51 @@ static const struct file_operations trace_options_fops = { .llseek = generic_file_llseek, }; +/* + * In order to pass in both the trace_array descriptor as well as the index + * to the flag that the trace option file represents, the trace_array + * has a character array of trace_flags_index[], which holds the index + * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. + * The address of this character array is passed to the flag option file + * read/write callbacks. + * + * In order to extract both the index and the trace_array descriptor, + * get_tr_index() uses the following algorithm. + * + * idx = *ptr; + * + * As the pointer itself contains the address of the index (remember + * index[1] == 1). + * + * Then to get the trace_array descriptor, by subtracting that index + * from the ptr, we get to the start of the index itself. + * + * ptr - idx == &index[0] + * + * Then a simple container_of() from that pointer gets us to the + * trace_array descriptor. + */ +static void get_tr_index(void *data, struct trace_array **ptr, + unsigned int *pindex) +{ + *pindex = *(unsigned char *)data; + + *ptr = container_of(data - *pindex, struct trace_array, + trace_flags_index); +} + static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; char *buf; - if (trace_flags & (1 << index)) + get_tr_index(tr_index, &tr, &index); + + if (tr->trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; @@ -6219,11 +6284,14 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = &global_trace; - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; unsigned long val; int ret; + get_tr_index(tr_index, &tr, &index); + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -6307,21 +6375,39 @@ create_trace_option_file(struct trace_array *tr, } -static struct trace_option_dentry * +static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; + struct trace_options *tr_topts; struct tracer_flags *flags; struct tracer_opt *opts; int cnt; + int i; if (!tracer) - return NULL; + return; flags = tracer->flags; if (!flags || !flags->opts) - return NULL; + return; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return; + + for (i = 0; i < tr->nr_topts; i++) { + /* + * Check if these flags have already been added. + * Some tracers share flags. + */ + if (tr->topts[i].tracer->flags == tracer->flags) + return; + } opts = flags->opts; @@ -6330,27 +6416,27 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return NULL; - - for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(tr, &topts[cnt], flags, - &opts[cnt]); - - return topts; -} - -static void -destroy_trace_option_files(struct trace_option_dentry *topts) -{ - int cnt; + return; - if (!topts) + tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), + GFP_KERNEL); + if (!tr_topts) { + kfree(topts); return; + } - for (cnt = 0; topts[cnt].opt; cnt++) - tracefs_remove(topts[cnt].entry); + tr->topts = tr_topts; + tr->topts[tr->nr_topts].tracer = tracer; + tr->topts[tr->nr_topts].topts = topts; + tr->nr_topts++; - kfree(topts); + for (cnt = 0; opts[cnt].name; cnt++) { + create_trace_option_file(tr, &topts[cnt], flags, + &opts[cnt]); + WARN_ONCE(topts[cnt].entry == NULL, + "Failed to create trace option: %s", + opts[cnt].name); + } } static struct dentry * @@ -6363,21 +6449,26 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, (void *)index, - &trace_options_core_fops); + return trace_create_file(option, 0644, t_options, + (void *)&tr->trace_flags_index[index], + &trace_options_core_fops); } -static __init void create_trace_options_dir(struct trace_array *tr) +static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; + bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; - for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(tr, trace_options[i], i); + for (i = 0; trace_options[i]; i++) { + if (top_level || + !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + create_trace_option_core_file(tr, trace_options[i], i); + } } static ssize_t @@ -6444,7 +6535,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size { enum ring_buffer_flags rb_flags; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; buf->tr = tr; @@ -6514,6 +6605,30 @@ static void free_trace_buffers(struct trace_array *tr) #endif } +static void init_trace_flags_index(struct trace_array *tr) +{ + int i; + + /* Used by the trace options files */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) + tr->trace_flags_index[i] = i; +} + +static void __update_tracer_options(struct trace_array *tr) +{ + struct tracer *t; + + for (t = trace_types; t; t = t->next) + add_tracer_options(tr, t); +} + +static void update_tracer_options(struct trace_array *tr) +{ + mutex_lock(&trace_types_lock); + __update_tracer_options(tr); + mutex_unlock(&trace_types_lock); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -6539,6 +6654,8 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + tr->trace_flags = global_trace.trace_flags; + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&tr->start_lock); @@ -6564,6 +6681,8 @@ static int instance_mkdir(const char *name) } init_tracer_tracefs(tr, tr->dir); + init_trace_flags_index(tr); + __update_tracer_options(tr); list_add(&tr->list, &ftrace_trace_arrays); @@ -6589,6 +6708,7 @@ static int instance_rmdir(const char *name) struct trace_array *tr; int found = 0; int ret; + int i; mutex_lock(&trace_types_lock); @@ -6611,9 +6731,14 @@ static int instance_rmdir(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); + for (i = 0; i < tr->nr_topts; i++) { + kfree(tr->topts[i].topts); + } + kfree(tr->topts); + kfree(tr->name); kfree(tr); @@ -6675,6 +6800,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + create_trace_options_dir(tr); + #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); @@ -6730,7 +6857,9 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!debugfs_initialized())) + if (WARN_ON(!tracefs_initialized()) || + (IS_ENABLED(CONFIG_DEBUG_FS) && + WARN_ON(!debugfs_initialized()))) return ERR_PTR(-ENODEV); /* @@ -6870,11 +6999,7 @@ static __init int tracer_init_tracefs(void) create_trace_instances(d_tracer); - create_trace_options_dir(&global_trace); - - /* If the tracer was started via cmdline, create options for it here */ - if (global_trace.current_trace != &nop_trace) - update_tracer_options(&global_trace, global_trace.current_trace); + update_tracer_options(&global_trace); return 0; } @@ -6973,6 +7098,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static atomic_t dump_running; + struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; @@ -6999,13 +7125,13 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; switch (oops_dump_mode) { case DUMP_ALL: @@ -7068,7 +7194,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - trace_flags |= old_userobj; + tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); @@ -7083,6 +7209,12 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; + /* + * Make sure we don't accidently add more trace options + * than we have bits for. + */ + BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -7141,6 +7273,8 @@ __init static int tracer_alloc_buffers(void) ftrace_init_global_array_ops(&global_trace); + init_trace_flags_index(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ @@ -7157,12 +7291,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.events); list_add(&global_trace.list, &ftrace_trace_arrays); - while (trace_boot_options) { - char *option; - - option = strsep(&trace_boot_options, ","); - trace_set_options(&global_trace, option); - } + apply_trace_boot_options(); register_snapshot_cmd(); diff --git a/kernel/kernel/trace/trace.h b/kernel/kernel/trace/trace.h index c0f3c568c..3bf86ece6 100644 --- a/kernel/kernel/trace/trace.h +++ b/kernel/kernel/trace/trace.h @@ -12,7 +12,7 @@ #include <linux/ftrace.h> #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/compiler.h> #include <linux/trace_seq.h> @@ -71,9 +71,6 @@ enum trace_type { tstruct \ } -#undef TP_ARGS -#define TP_ARGS(args...) args - #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) @@ -158,9 +155,12 @@ struct trace_array_cpu { pid_t pid; kuid_t uid; char comm[TASK_COMM_LEN]; + + bool ignore_pid; }; struct tracer; +struct trace_option_dentry; struct trace_buffer { struct trace_array *tr; @@ -170,6 +170,19 @@ struct trace_buffer { int cpu; }; +#define TRACE_FLAGS_MAX_SIZE 32 + +struct trace_options { + struct tracer *tracer; + struct trace_option_dentry *topts; +}; + +struct trace_pid_list { + unsigned int nr_pids; + int order; + pid_t *pids; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -195,6 +208,7 @@ struct trace_array { bool allocated_snapshot; unsigned long max_latency; #endif + struct trace_pid_list __rcu *filtered_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are @@ -213,18 +227,22 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; + struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; + int nr_topts; struct tracer *current_trace; + unsigned int trace_flags; + unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; struct dentry *event_dir; + struct trace_options *topts; struct list_head systems; struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ @@ -335,6 +353,13 @@ struct tracer_flags { #define TRACER_OPT(s, b) .name = #s, .bit = b +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct trace_array *tr; + struct dentry *entry; +}; + /** * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file @@ -613,29 +638,12 @@ void update_max_tr_single(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc); - -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs); - void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_stack(struct ring_buffer *buffer, - unsigned long flags, int skip, int pc) -{ -} - -static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, - unsigned long flags, int skip, - int pc, struct pt_regs *regs) -{ -} - static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -661,7 +669,6 @@ extern int DYN_FTRACE_TEST_NAME2(void); extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, @@ -709,8 +716,6 @@ int trace_array_printk_buf(struct ring_buffer *buffer, void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); -extern unsigned long trace_flags; - extern char trace_find_mark(unsigned long long duration); /* Standard output formatting function used for function return traces */ @@ -725,9 +730,14 @@ extern char trace_find_mark(unsigned long long duration); #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 #define TRACE_GRAPH_PRINT_TAIL 0x80 +#define TRACE_GRAPH_SLEEP_TIME 0x100 +#define TRACE_GRAPH_GRAPH_TIME 0x200 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) +extern void ftrace_graph_sleep_time_control(bool enable); +extern void ftrace_graph_graph_time_control(bool enable); + extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); @@ -861,7 +871,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops); #define ftrace_destroy_filter_files(ops) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ -int ftrace_event_is_function(struct ftrace_event_call *call); +bool ftrace_event_is_function(struct trace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces @@ -899,42 +909,94 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos); /* + * Only create function graph options if function graph is configured. + */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FGRAPH_FLAGS \ + C(DISPLAY_GRAPH, "display-graph"), +#else +# define FGRAPH_FLAGS +#endif + +#ifdef CONFIG_BRANCH_TRACER +# define BRANCH_FLAGS \ + C(BRANCH, "branch"), +#else +# define BRANCH_FLAGS +#endif + +#ifdef CONFIG_FUNCTION_TRACER +# define FUNCTION_FLAGS \ + C(FUNCTION, "function-trace"), +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +#else +# define FUNCTION_FLAGS +# define FUNCTION_DEFAULT_FLAGS 0UL +#endif + +#ifdef CONFIG_STACKTRACE +# define STACK_FLAGS \ + C(STACKTRACE, "stacktrace"), +#else +# define STACK_FLAGS +#endif + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * * NOTE: These bits must match the trace_options array in - * trace.c. + * trace.c (this macro guarantees it). + */ +#define TRACE_FLAGS \ + C(PRINT_PARENT, "print-parent"), \ + C(SYM_OFFSET, "sym-offset"), \ + C(SYM_ADDR, "sym-addr"), \ + C(VERBOSE, "verbose"), \ + C(RAW, "raw"), \ + C(HEX, "hex"), \ + C(BIN, "bin"), \ + C(BLOCK, "block"), \ + C(PRINTK, "trace_printk"), \ + C(ANNOTATE, "annotate"), \ + C(USERSTACKTRACE, "userstacktrace"), \ + C(SYM_USEROBJ, "sym-userobj"), \ + C(PRINTK_MSGONLY, "printk-msg-only"), \ + C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ + C(LATENCY_FMT, "latency-format"), \ + C(RECORD_CMD, "record-cmd"), \ + C(OVERWRITE, "overwrite"), \ + C(STOP_ON_FREE, "disable_on_free"), \ + C(IRQ_INFO, "irq-info"), \ + C(MARKERS, "markers"), \ + FUNCTION_FLAGS \ + FGRAPH_FLAGS \ + STACK_FLAGS \ + BRANCH_FLAGS + +/* + * By defining C, we can make TRACE_FLAGS a list of bit names + * that will define the bits for the flag masks. */ -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_PRINTK = 0x200, - TRACE_ITER_PREEMPTONLY = 0x400, - TRACE_ITER_BRANCH = 0x800, - TRACE_ITER_ANNOTATE = 0x1000, - TRACE_ITER_USERSTACKTRACE = 0x2000, - TRACE_ITER_SYM_USEROBJ = 0x4000, - TRACE_ITER_PRINTK_MSGONLY = 0x8000, - TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x20000, - TRACE_ITER_SLEEP_TIME = 0x40000, - TRACE_ITER_GRAPH_TIME = 0x80000, - TRACE_ITER_RECORD_CMD = 0x100000, - TRACE_ITER_OVERWRITE = 0x200000, - TRACE_ITER_STOP_ON_FREE = 0x400000, - TRACE_ITER_IRQ_INFO = 0x800000, - TRACE_ITER_MARKERS = 0x1000000, - TRACE_ITER_FUNCTION = 0x2000000, +#undef C +#define C(a, b) TRACE_ITER_##a##_BIT + +enum trace_iterator_bits { + TRACE_FLAGS + /* Make sure we don't go more than we have bits for */ + TRACE_ITER_LAST_BIT }; /* + * By redefining C, we can make TRACE_FLAGS a list of masks that + * use the bits as defined above. + */ +#undef C +#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) + +enum trace_iterator_flags { TRACE_FLAGS }; + +/* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ @@ -948,7 +1010,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER_BRANCH) return enable_branch_tracing(tr); return 0; } @@ -995,7 +1057,7 @@ struct event_subsystem { int ref_count; }; -struct ftrace_subsystem_dir { +struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; @@ -1055,30 +1117,30 @@ struct filter_pred { extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_file *file, +extern void print_event_filter(struct trace_event_file *file, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_file *file, +extern int apply_event_filter(struct trace_event_file *file, char *filter_string); -extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, +extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct ftrace_event_call *call, +extern int create_event_filter(struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * -trace_find_event_field(struct ftrace_event_call *call, char *name); +trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); -extern struct ftrace_event_file *find_event_file(struct trace_array *tr, - const char *system, - const char *event); +extern struct trace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); static inline void *event_file_data(struct file *filp) { @@ -1183,7 +1245,7 @@ struct event_trigger_ops { * commands need to do this if they themselves log to the trace * buffer (see the @post_trigger() member below). @trigger_type * values are defined by adding new values to the trigger_type - * enum in include/linux/ftrace_event.h. + * enum in include/linux/trace_events.h. * * @post_trigger: A flag that says whether or not this command needs * to have its action delayed until after the current event has @@ -1245,23 +1307,23 @@ struct event_command { enum event_trigger_type trigger_type; bool post_trigger; int (*func)(struct event_command *cmd_ops, - struct ftrace_event_file *file, + struct trace_event_file *file, char *glob, char *cmd, char *params); int (*reg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file); + struct trace_event_file *file); void (*unreg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file); + struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, - struct ftrace_event_file *file); + struct trace_event_file *file); struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; -extern int trace_event_enable_disable(struct ftrace_event_file *file, +extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); @@ -1271,6 +1333,7 @@ extern const char *__stop___trace_bprintk_fmt[]; extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; +void trace_printk_control(bool enabled); void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); @@ -1289,7 +1352,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ - extern struct ftrace_event_call \ + extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ @@ -1298,7 +1361,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) -int perf_ftrace_event_register(struct ftrace_event_call *call, +int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL diff --git a/kernel/kernel/trace/trace_benchmark.c b/kernel/kernel/trace/trace_benchmark.c index 40a14cbcf..0f109c413 100644 --- a/kernel/kernel/trace/trace_benchmark.c +++ b/kernel/kernel/trace/trace_benchmark.c @@ -43,7 +43,7 @@ static void trace_do_benchmark(void) unsigned int std = 0; /* Only run if the tracepoint is actually active */ - if (!trace_benchmark_event_enabled()) + if (!trace_benchmark_event_enabled() || !tracing_is_on()) return; local_irq_disable(); diff --git a/kernel/kernel/trace/trace_branch.c b/kernel/kernel/trace/trace_branch.c index 1879980f0..3a2a73716 100644 --- a/kernel/kernel/trace/trace_branch.c +++ b/kernel/kernel/trace/trace_branch.c @@ -29,7 +29,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { - struct ftrace_event_call *call = &event_branch; + struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct trace_array_cpu *data; struct ring_buffer_event *event; @@ -125,25 +125,14 @@ void disable_branch_tracing(void) mutex_unlock(&branch_tracing_mutex); } -static void start_branch_trace(struct trace_array *tr) -{ - enable_branch_tracing(tr); -} - -static void stop_branch_trace(struct trace_array *tr) -{ - disable_branch_tracing(); -} - static int branch_trace_init(struct trace_array *tr) { - start_branch_trace(tr); - return 0; + return enable_branch_tracing(tr); } static void branch_trace_reset(struct trace_array *tr) { - stop_branch_trace(tr); + disable_branch_tracing(); } static enum print_line_t trace_branch_print(struct trace_iterator *iter, @@ -194,7 +183,7 @@ __init static int init_branch_tracer(void) { int ret; - ret = register_ftrace_event(&trace_branch_event); + ret = register_trace_event(&trace_branch_event); if (!ret) { printk(KERN_WARNING "Warning: could not register " "branch events\n"); diff --git a/kernel/kernel/trace/trace_clock.c b/kernel/kernel/trace/trace_clock.c index 57b67b1f2..0f06532a7 100644 --- a/kernel/kernel/trace/trace_clock.c +++ b/kernel/kernel/trace/trace_clock.c @@ -56,6 +56,7 @@ u64 notrace trace_clock(void) { return local_clock(); } +EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. @@ -68,6 +69,7 @@ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } +EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock @@ -123,6 +125,7 @@ u64 notrace trace_clock_global(void) return now; } +EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; diff --git a/kernel/kernel/trace/trace_event_perf.c b/kernel/kernel/trace/trace_event_perf.c index 6fa484de2..cc9f7a931 100644 --- a/kernel/kernel/trace/trace_event_perf.c +++ b/kernel/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ @@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_perm(struct ftrace_event_call *tp_event, +static int perf_trace_event_perm(struct trace_event_call *tp_event, struct perf_event *p_event) { if (tp_event->perf_perm) { @@ -83,7 +83,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_reg(struct ftrace_event_call *tp_event, +static int perf_trace_event_reg(struct trace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; @@ -143,7 +143,7 @@ fail: static void perf_trace_event_unreg(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; int i; if (--tp_event->perf_refcount > 0) @@ -172,17 +172,17 @@ out: static int perf_trace_event_open(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); } static void perf_trace_event_close(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, +static int perf_trace_event_init(struct trace_event_call *tp_event, struct perf_event *p_event) { int ret; @@ -206,7 +206,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { - struct ftrace_event_call *tp_event; + struct trace_event_call *tp_event; u64 event_id = p_event->attr.config; int ret = -EINVAL; @@ -236,7 +236,7 @@ void perf_trace_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; struct hlist_head __percpu *pcpu_list; struct hlist_head *list; @@ -255,7 +255,7 @@ int perf_trace_add(struct perf_event *p_event, int flags) void perf_trace_del(struct perf_event *p_event, int flags) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; hlist_del_rcu(&p_event->hlist_entry); tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } @@ -357,7 +357,7 @@ static void perf_ftrace_function_disable(struct perf_event *event) ftrace_function_local_disable(&event->ftrace_ops); } -int perf_ftrace_event_register(struct ftrace_event_call *call, +int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { switch (type) { diff --git a/kernel/kernel/trace/trace_events.c b/kernel/kernel/trace/trace_events.c index 711529bba..0e508e99b 100644 --- a/kernel/kernel/trace/trace_events.c +++ b/kernel/kernel/trace/trace_events.c @@ -15,11 +15,15 @@ #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> +#include <linux/bsearch.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/sort.h> #include <linux/slab.h> #include <linux/delay.h> +#include <trace/events/sched.h> + #include <asm/setup.h> #include "trace_output.h" @@ -30,6 +34,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -37,21 +42,19 @@ static LIST_HEAD(ftrace_common_fields); static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; -#define SYSTEM_FL_FREE_NAME (1 << 31) - static inline int system_refcount(struct event_subsystem *system) { - return system->ref_count & ~SYSTEM_FL_FREE_NAME; + return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { - return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; + return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { - return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; + return --system->ref_count; } /* Double loops, do not use break, only goto's work */ @@ -61,14 +64,14 @@ static int system_refcount_dec(struct event_subsystem *system) #define do_for_each_event_file_safe(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ - struct ftrace_event_file *___n; \ + struct trace_event_file *___n; \ list_for_each_entry_safe(file, ___n, &tr->events, list) #define while_for_each_event_file() \ } static struct list_head * -trace_get_fields(struct ftrace_event_call *event_call) +trace_get_fields(struct trace_event_call *event_call) { if (!event_call->class->get_fields) return &event_call->class->fields; @@ -89,17 +92,21 @@ __find_event_field(struct list_head *head, char *name) } struct ftrace_event_field * -trace_find_event_field(struct ftrace_event_call *call, char *name) +trace_find_event_field(struct trace_event_call *call, char *name) { struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_common_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -129,7 +136,7 @@ static int __trace_define_field(struct list_head *head, const char *type, return 0; } -int trace_define_field(struct ftrace_event_call *call, const char *type, +int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { @@ -144,6 +151,13 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +167,18 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -168,7 +194,7 @@ static int trace_define_common_fields(void) return ret; } -static void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct trace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; @@ -180,11 +206,11 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -int trace_event_raw_init(struct ftrace_event_call *call) +int trace_event_raw_init(struct trace_event_call *call) { int id; - id = register_ftrace_event(&call->event); + id = register_trace_event(&call->event); if (!id) return -ENODEV; @@ -192,18 +218,38 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, - struct ftrace_event_file *ftrace_file, - unsigned long len) +bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) +{ + struct trace_array *tr = trace_file->tr; + struct trace_array_cpu *data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + if (!pid_list) + return false; + + data = this_cpu_ptr(tr->trace_buffer.data); + + return data->ignore_pid; +} +EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); + +void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, + struct trace_event_file *trace_file, + unsigned long len) { - struct ftrace_event_call *event_call = ftrace_file->event_call; + struct trace_event_call *event_call = trace_file->event_call; + + if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(trace_file)) + return NULL; local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); - fbuffer->ftrace_file = ftrace_file; + fbuffer->trace_file = trace_file; fbuffer->event = - trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, fbuffer->flags, fbuffer->pc); if (!fbuffer->event) @@ -212,13 +258,13 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } -EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); +EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); static DEFINE_SPINLOCK(tracepoint_iter_lock); -static void output_printk(struct ftrace_event_buffer *fbuffer) +static void output_printk(struct trace_event_buffer *fbuffer) { - struct ftrace_event_call *event_call; + struct trace_event_call *event_call; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; @@ -226,12 +272,12 @@ static void output_printk(struct ftrace_event_buffer *fbuffer) if (!iter) return; - event_call = fbuffer->ftrace_file->event_call; + event_call = fbuffer->trace_file->event_call; if (!event_call || !event_call->event.funcs || !event_call->event.funcs->trace) return; - event = &fbuffer->ftrace_file->event_call->event; + event = &fbuffer->trace_file->event_call->event; spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); @@ -243,21 +289,21 @@ static void output_printk(struct ftrace_event_buffer *fbuffer) spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } -void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { if (tracepoint_printk) output_printk(fbuffer); - event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc); } -EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); -int ftrace_event_reg(struct ftrace_event_call *call, - enum trace_reg type, void *data) +int trace_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) { - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { @@ -290,34 +336,35 @@ int ftrace_event_reg(struct ftrace_event_call *call, } return 0; } -EXPORT_SYMBOL_GPL(ftrace_event_reg); +EXPORT_SYMBOL_GPL(trace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr; mutex_lock(&event_mutex); do_for_each_event_file(tr, file) { - if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int __ftrace_event_enable_disable(struct ftrace_event_file *file, +static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; int ret = 0; int disable; @@ -339,24 +386,24 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; - disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; - clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; + clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); } else - disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); - if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { - clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); - if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { + if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { + clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); + if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (file->flags & EVENT_FILE_FL_SOFT_MODE) + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* @@ -368,31 +415,31 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, * it still seems to be disabled. */ if (!soft_disable) - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); } - if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) { /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " - "%s\n", ftrace_event_name(call)); + "%s\n", trace_event_name(call)); break; } - set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ call->flags |= TRACE_EVENT_FL_WAS_ENABLED; @@ -403,13 +450,13 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } -int trace_event_enable_disable(struct ftrace_event_file *file, +int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { return __ftrace_event_enable_disable(file, enable, soft_disable); } -static int ftrace_event_enable_disable(struct ftrace_event_file *file, +static int ftrace_event_enable_disable(struct trace_event_file *file, int enable) { return __ftrace_event_enable_disable(file, enable, 0); @@ -417,7 +464,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, static void ftrace_clear_events(struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { @@ -426,6 +473,148 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } +static int cmp_pid(const void *key, const void *elt) +{ + const pid_t *search_pid = key; + const pid_t *pid = elt; + + if (*search_pid == *pid) + return 0; + if (*search_pid < *pid) + return -1; + return 1; +} + +static bool +check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + pid_t search_pid; + pid_t *pid; + + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + search_pid = task->pid; + + pid = bsearch(&search_pid, filtered_pids->pids, + filtered_pids->nr_pids, sizeof(pid_t), + cmp_pid); + if (!pid) + return true; + + return false; +} + +static void +event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, prev) && + check_ignore_pid(pid_list, next)); +} + +static void +event_filter_pid_sched_switch_probe_post(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, next)); +} + +static void +event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are already tracing */ + if (!this_cpu_read(tr->trace_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, task)); +} + +static void +event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are not tracing */ + if (this_cpu_read(tr->trace_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + /* Set tracing if current is enabled */ + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, current)); +} + +static void __ftrace_clear_event_pids(struct trace_array *tr) +{ + struct trace_pid_list *pid_list; + struct trace_event_file *file; + int cpu; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + if (!pid_list) + return; + + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); + + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); + + list_for_each_entry(file, &tr->events, list) { + clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false; + + rcu_assign_pointer(tr->filtered_pids, NULL); + + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); + + free_pages((unsigned long)pid_list->pids, pid_list->order); + kfree(pid_list); +} + +static void ftrace_clear_event_pids(struct trace_array *tr) +{ + mutex_lock(&event_mutex); + __ftrace_clear_event_pids(tr); + mutex_unlock(&event_mutex); +} + static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; @@ -440,8 +629,7 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } - if (system->ref_count & SYSTEM_FL_FREE_NAME) - kfree(system->name); + kfree_const(system->name); kfree(system); } @@ -451,14 +639,14 @@ static void __get_system(struct event_subsystem *system) system_refcount_inc(system); } -static void __get_system_dir(struct ftrace_subsystem_dir *dir) +static void __get_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); dir->ref_count++; __get_system(dir->subsystem); } -static void __put_system_dir(struct ftrace_subsystem_dir *dir) +static void __put_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ @@ -469,14 +657,14 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir) kfree(dir); } -static void put_system(struct ftrace_subsystem_dir *dir) +static void put_system(struct trace_subsystem_dir *dir) { mutex_lock(&event_mutex); __put_system_dir(dir); mutex_unlock(&event_mutex); } -static void remove_subsystem(struct ftrace_subsystem_dir *dir) +static void remove_subsystem(struct trace_subsystem_dir *dir) { if (!dir) return; @@ -488,7 +676,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) } } -static void remove_event_file_dir(struct ftrace_event_file *file) +static void remove_event_file_dir(struct trace_event_file *file) { struct dentry *dir = file->dir; struct dentry *child; @@ -517,15 +705,15 @@ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { - struct ftrace_event_file *file; - struct ftrace_event_call *call; + struct trace_event_file *file; + struct trace_event_call *call; const char *name; int ret = -EINVAL; list_for_each_entry(file, &tr->events, list) { call = file->event_call; - name = ftrace_event_name(call); + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; @@ -673,8 +861,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_file *file = v; - struct ftrace_event_call *call; + struct trace_event_file *file = v; + struct trace_event_call *call; struct trace_array *tr = m->private; (*pos)++; @@ -685,7 +873,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } @@ -694,13 +883,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - file = list_entry(&tr->events, struct ftrace_event_file, list); + file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = t_next(m, file, &l); if (!file) @@ -712,13 +901,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_file *file = v; + struct trace_event_file *file = v; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { - if (file->flags & FTRACE_EVENT_FL_ENABLED) + if (file->flags & EVENT_FILE_FL_ENABLED) return file; } @@ -727,13 +916,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - file = list_entry(&tr->events, struct ftrace_event_file, list); + file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = s_next(m, file, &l); if (!file) @@ -744,12 +933,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_file *file = v; - struct ftrace_event_call *call = file->event_call; + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", ftrace_event_name(call)); + seq_printf(m, "%s\n", trace_event_name(call)); return 0; } @@ -759,11 +948,63 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void *p_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + /* + * Grab the mutex, to keep calls to p_next() having the same + * tr->filtered_pids as p_start() has. + * If we just passed the tr->filtered_pids around, then RCU would + * have been enough, but doing that makes things more complex. + */ + mutex_lock(&event_mutex); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + if (!pid_list || *pos >= pid_list->nr_pids) + return NULL; + + return (void *)&pid_list->pids[*pos]; +} + +static void p_stop(struct seq_file *m, void *p) + __releases(RCU) +{ + rcu_read_unlock_sched(); + mutex_unlock(&event_mutex); +} + +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + + (*pos)++; + + if (*pos >= pid_list->nr_pids) + return NULL; + + return (void *)&pid_list->pids[*pos]; +} + +static int p_show(struct seq_file *m, void *v) +{ + pid_t *pid = v; + + seq_printf(m, "%d\n", *pid); + return 0; +} + static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; unsigned long flags; char buf[4] = "0"; @@ -776,12 +1017,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!file) return -ENODEV; - if (flags & FTRACE_EVENT_FL_ENABLED && - !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + if (flags & EVENT_FILE_FL_ENABLED && + !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || - flags & FTRACE_EVENT_FL_SOFT_MODE) + if (flags & EVENT_FILE_FL_SOFT_DISABLED || + flags & EVENT_FILE_FL_SOFT_MODE) strcat(buf, "*"); strcat(buf, "\n"); @@ -793,7 +1034,7 @@ static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; unsigned long val; int ret; @@ -830,10 +1071,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; - struct ftrace_event_call *call; - struct ftrace_event_file *file; + struct trace_event_call *call; + struct trace_event_file *file; struct trace_array *tr = dir->tr; char buf[2]; int set = 0; @@ -842,7 +1083,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!ftrace_event_name(call) || !call->class || !call->class->reg) + if (!trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -853,7 +1094,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -875,7 +1116,7 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; @@ -919,7 +1160,7 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = event_file_data(m->private); + struct trace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; @@ -951,13 +1192,13 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = event_file_data(m->private); + struct trace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: - seq_printf(m, "name: %s\n", ftrace_event_name(call)); + seq_printf(m, "name: %s\n", trace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_puts(m, "format:\n"); return 0; @@ -1064,7 +1305,7 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_seq *s; int r = -ENODEV; @@ -1097,7 +1338,7 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct trace_event_file *file; char *buf; int err = -ENODEV; @@ -1134,7 +1375,7 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; - struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ struct trace_array *tr; int ret; @@ -1183,7 +1424,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) static int system_tr_open(struct inode *inode, struct file *filp) { - struct ftrace_subsystem_dir *dir; + struct trace_subsystem_dir *dir; struct trace_array *tr = inode->i_private; int ret; @@ -1216,7 +1457,7 @@ static int system_tr_open(struct inode *inode, struct file *filp) static int subsystem_release(struct inode *inode, struct file *file) { - struct ftrace_subsystem_dir *dir = file->private_data; + struct trace_subsystem_dir *dir = file->private_data; trace_array_put(dir->tr); @@ -1237,7 +1478,7 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -1264,7 +1505,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_subsystem_dir *dir = filp->private_data; + struct trace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -1316,8 +1557,219 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static int max_pids(struct trace_pid_list *pid_list) +{ + return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); +} + +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->filtered_pids, + mutex_is_locked(&event_mutex)); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, current)); +} + +static ssize_t +ftrace_event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list = NULL; + struct trace_event_file *file; + struct trace_parser parser; + unsigned long val; + loff_t this_pos; + ssize_t read = 0; + ssize_t ret = 0; + pid_t pid; + int i; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) + return -ENOMEM; + + mutex_lock(&event_mutex); + /* + * Load as many pids into the array before doing a + * swap from the tr->filtered_pids to the new list. + */ + while (cnt > 0) { + + this_pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &this_pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val > INT_MAX) + break; + + pid = (pid_t)val; + + ret = -ENOMEM; + if (!pid_list) { + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + break; + + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + if (filtered_pids) + pid_list->order = filtered_pids->order; + else + pid_list->order = 0; + + pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, + pid_list->order); + if (!pid_list->pids) + break; + + if (filtered_pids) { + pid_list->nr_pids = filtered_pids->nr_pids; + memcpy(pid_list->pids, filtered_pids->pids, + pid_list->nr_pids * sizeof(pid_t)); + } else + pid_list->nr_pids = 0; + } + + if (pid_list->nr_pids >= max_pids(pid_list)) { + pid_t *pid_page; + + pid_page = (void *)__get_free_pages(GFP_KERNEL, + pid_list->order + 1); + if (!pid_page) + break; + memcpy(pid_page, pid_list->pids, + pid_list->nr_pids * sizeof(pid_t)); + free_pages((unsigned long)pid_list->pids, pid_list->order); + + pid_list->order++; + pid_list->pids = pid_page; + } + + pid_list->pids[pid_list->nr_pids++] = pid; + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + if (pid_list) + free_pages((unsigned long)pid_list->pids, pid_list->order); + kfree(pid_list); + mutex_unlock(&event_mutex); + return ret; + } + + if (!pid_list) { + mutex_unlock(&event_mutex); + return ret; + } + + sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); + + /* Remove duplicates */ + for (i = 1; i < pid_list->nr_pids; i++) { + int start = i; + + while (i < pid_list->nr_pids && + pid_list->pids[i - 1] == pid_list->pids[i]) + i++; + + if (start != i) { + if (i < pid_list->nr_pids) { + memmove(&pid_list->pids[start], &pid_list->pids[i], + (pid_list->nr_pids - i) * sizeof(pid_t)); + pid_list->nr_pids -= i - start; + i = start; + } else + pid_list->nr_pids = start; + } + } + + rcu_assign_pointer(tr->filtered_pids, pid_list); + + list_for_each_entry(file, &tr->events, list) { + set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + if (filtered_pids) { + synchronize_sched(); + + free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); + kfree(filtered_pids); + } else { + /* + * Register a probe that is called before all other probes + * to set ignore_pid if next or prev do not match. + * Register a probe this is called after all other probes + * to only keep ignore_pid set if next pid matches. + */ + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + } + + /* + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. + */ + on_each_cpu(ignore_task_cpu, tr, 1); + + mutex_unlock(&event_mutex); + + ret = read; + *ppos += read; + + return ret; +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { @@ -1334,6 +1786,13 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; +static const struct seq_operations show_set_pid_seq_ops = { + .start = p_start, + .next = p_next, + .show = p_show, + .stop = p_stop, +}; + static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, @@ -1349,6 +1808,14 @@ static const struct file_operations ftrace_set_event_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_set_event_pid_fops = { + .open = ftrace_event_set_pid_open, + .read = seq_read, + .write = ftrace_event_pid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, @@ -1459,6 +1926,26 @@ ftrace_event_set_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_event_set_pid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + static struct event_subsystem * create_new_subsystem(const char *name) { @@ -1472,13 +1959,9 @@ create_new_subsystem(const char *name) system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ - if (!core_kernel_data((unsigned long)name)) { - system->ref_count |= SYSTEM_FL_FREE_NAME; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) - goto out_free; - } else - system->name = name; + system->name = kstrdup_const(name, GFP_KERNEL); + if (!system->name) + goto out_free; system->filter = NULL; @@ -1491,17 +1974,16 @@ create_new_subsystem(const char *name) return system; out_free: - if (system->ref_count & SYSTEM_FL_FREE_NAME) - kfree(system->name); + kfree_const(system->name); kfree(system); return NULL; } static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, - struct ftrace_event_file *file, struct dentry *parent) + struct trace_event_file *file, struct dentry *parent) { - struct ftrace_subsystem_dir *dir; + struct trace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; @@ -1573,9 +2055,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } static int -event_create_dir(struct dentry *parent, struct ftrace_event_file *file) +event_create_dir(struct dentry *parent, struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; struct list_head *head; struct dentry *d_events; @@ -1593,7 +2075,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) } else d_events = parent; - name = ftrace_event_name(call); + name = trace_event_name(call); file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { pr_warn("Could not create tracefs '%s' directory\n", name); @@ -1636,9 +2118,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) return 0; } -static void remove_event_from_tracers(struct ftrace_event_call *call) +static void remove_event_from_tracers(struct trace_event_call *call) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { @@ -1656,10 +2138,10 @@ static void remove_event_from_tracers(struct ftrace_event_call *call) } while_for_each_event_file(); } -static void event_remove(struct ftrace_event_call *call) +static void event_remove(struct trace_event_call *call) { struct trace_array *tr; - struct ftrace_event_file *file; + struct trace_event_file *file; do_for_each_event_file(tr, file) { if (file->event_call != call) @@ -1675,17 +2157,17 @@ static void event_remove(struct ftrace_event_call *call) } while_for_each_event_file(); if (call->event.funcs) - __unregister_ftrace_event(&call->event); + __unregister_trace_event(&call->event); remove_event_from_tracers(call); list_del(&call->list); } -static int event_init(struct ftrace_event_call *call) +static int event_init(struct trace_event_call *call) { int ret = 0; const char *name; - name = ftrace_event_name(call); + name = trace_event_name(call); if (WARN_ON(!name)) return -EINVAL; @@ -1699,7 +2181,7 @@ static int event_init(struct ftrace_event_call *call) } static int -__register_event(struct ftrace_event_call *call, struct module *mod) +__register_event(struct trace_event_call *call, struct module *mod) { int ret; @@ -1735,7 +2217,7 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) return ptr + elen; } -static void update_event_printk(struct ftrace_event_call *call, +static void update_event_printk(struct trace_event_call *call, struct trace_enum_map *map) { char *ptr; @@ -1813,7 +2295,7 @@ static void update_event_printk(struct ftrace_event_call *call, void trace_event_enum_update(struct trace_enum_map **map, int len) { - struct ftrace_event_call *call, *p; + struct trace_event_call *call, *p; const char *last_system = NULL; int last_i; int i; @@ -1838,11 +2320,11 @@ void trace_event_enum_update(struct trace_enum_map **map, int len) up_write(&trace_event_sem); } -static struct ftrace_event_file * -trace_create_new_event(struct ftrace_event_call *call, +static struct trace_event_file * +trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) @@ -1860,9 +2342,9 @@ trace_create_new_event(struct ftrace_event_call *call, /* Add an event to a trace directory */ static int -__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) +__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; file = trace_create_new_event(call, tr); if (!file) @@ -1877,10 +2359,10 @@ __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) * the filesystem is initialized. */ static __init int -__trace_early_add_new_event(struct ftrace_event_call *call, +__trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; file = trace_create_new_event(call, tr); if (!file) @@ -1890,10 +2372,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call, } struct ftrace_module_file_ops; -static void __add_event_to_tracers(struct ftrace_event_call *call); +static void __add_event_to_tracers(struct trace_event_call *call); /* Add an additional event_call dynamically */ -int trace_add_event_call(struct ftrace_event_call *call) +int trace_add_event_call(struct trace_event_call *call) { int ret; mutex_lock(&trace_types_lock); @@ -1912,7 +2394,7 @@ int trace_add_event_call(struct ftrace_event_call *call) * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. */ -static void __trace_remove_event_call(struct ftrace_event_call *call) +static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); @@ -1920,10 +2402,10 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) call->filter = NULL; } -static int probe_remove_event_call(struct ftrace_event_call *call) +static int probe_remove_event_call(struct trace_event_call *call) { struct trace_array *tr; - struct ftrace_event_file *file; + struct trace_event_file *file; #ifdef CONFIG_PERF_EVENTS if (call->perf_refcount) @@ -1934,10 +2416,10 @@ static int probe_remove_event_call(struct ftrace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress + * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress * TRACE_REG_UNREGISTER. */ - if (file->flags & FTRACE_EVENT_FL_ENABLED) + if (file->flags & EVENT_FILE_FL_ENABLED) return -EBUSY; /* * The do_for_each_event_file_safe() is @@ -1954,7 +2436,7 @@ static int probe_remove_event_call(struct ftrace_event_call *call) } /* Remove an event_call */ -int trace_remove_event_call(struct ftrace_event_call *call) +int trace_remove_event_call(struct trace_event_call *call) { int ret; @@ -1978,7 +2460,7 @@ int trace_remove_event_call(struct ftrace_event_call *call) static void trace_module_add_events(struct module *mod) { - struct ftrace_event_call **call, **start, **end; + struct trace_event_call **call, **start, **end; if (!mod->num_trace_events) return; @@ -2001,7 +2483,7 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { - struct ftrace_event_call *call, *p; + struct trace_event_call *call, *p; bool clear_trace = false; down_write(&trace_event_sem); @@ -2057,28 +2539,28 @@ static struct notifier_block trace_module_nb = { static void __trace_add_event_dirs(struct trace_array *tr) { - struct ftrace_event_call *call; + struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create directory for event %s\n", - ftrace_event_name(call)); + trace_event_name(call)); } } -struct ftrace_event_file * +struct trace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { - struct ftrace_event_file *file; - struct ftrace_event_call *call; + struct trace_event_file *file; + struct trace_event_call *call; const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; - name = ftrace_event_name(call); + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; @@ -2100,7 +2582,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) #define DISABLE_EVENT_STR "disable_event" struct event_probe_data { - struct ftrace_event_file *file; + struct trace_event_file *file; unsigned long count; int ref; bool enable; @@ -2116,9 +2598,9 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) return; if (data->enable) - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } static void @@ -2134,7 +2616,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data return; /* Skip if the event is in a state we want to switch to */ - if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (data->count != -1) @@ -2154,7 +2636,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%s:%s:%s", data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, data->file->event_call->class->system, - ftrace_event_name(data->file->event_call)); + trace_event_name(data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited\n"); @@ -2228,7 +2710,7 @@ event_enable_func(struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { struct trace_array *tr = top_trace_array(); - struct ftrace_event_file *file; + struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; const char *system; @@ -2360,7 +2842,7 @@ static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* - * The top level array has already had its ftrace_event_file + * The top level array has already had its trace_event_file * descriptors created in order to allow for early events to * be recorded. This function is called after the tracefs has been * initialized, and we now have to create the files associated @@ -2369,7 +2851,7 @@ static inline int register_event_cmds(void) { return 0; } static __init void __trace_early_add_event_dirs(struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; int ret; @@ -2377,7 +2859,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warn("Could not create directory for event %s\n", - ftrace_event_name(file->event_call)); + trace_event_name(file->event_call)); } } @@ -2390,7 +2872,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) static __init void __trace_early_add_events(struct trace_array *tr) { - struct ftrace_event_call *call; + struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { @@ -2401,7 +2883,7 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create early event %s\n", - ftrace_event_name(call)); + trace_event_name(call)); } } @@ -2409,13 +2891,13 @@ __trace_early_add_events(struct trace_array *tr) static void __trace_remove_event_dirs(struct trace_array *tr) { - struct ftrace_event_file *file, *next; + struct trace_event_file *file, *next; list_for_each_entry_safe(file, next, &tr->events, list) remove_event_file_dir(file); } -static void __add_event_to_tracers(struct ftrace_event_call *call) +static void __add_event_to_tracers(struct trace_event_call *call) { struct trace_array *tr; @@ -2423,8 +2905,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call) __trace_add_new_event(call, tr); } -extern struct ftrace_event_call *__start_ftrace_events[]; -extern struct ftrace_event_call *__stop_ftrace_events[]; +extern struct trace_event_call *__start_ftrace_events[]; +extern struct trace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; @@ -2458,6 +2940,9 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = tracefs_create_file("set_event_pid", 0644, parent, + tr, &ftrace_set_event_pid_fops); + /* ring buffer internal formats */ trace_create_file("header_page", 0444, d_events, ring_buffer_print_page_header, @@ -2538,6 +3023,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); + /* Clear the pid list */ + __ftrace_clear_event_pids(tr); + /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); @@ -2559,7 +3047,7 @@ int event_trace_del_tracer(struct trace_array *tr) static __init int event_trace_memsetup(void) { field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); - file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); return 0; } @@ -2575,16 +3063,16 @@ early_enable_events(struct trace_array *tr, bool disable_first) if (!token) break; - if (!*token) - continue; - /* Restarting syscalls requires that we stop them first */ - if (disable_first) - ftrace_set_clr_event(tr, token, 0); + if (*token) { + /* Restarting syscalls requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } /* Put back the comma to allow this to be called again */ if (buf) @@ -2595,7 +3083,7 @@ early_enable_events(struct trace_array *tr, bool disable_first) static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); - struct ftrace_event_call **iter, *call; + struct trace_event_call **iter, *call; int ret; if (!tr) @@ -2673,6 +3161,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); @@ -2756,9 +3247,9 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { - struct ftrace_subsystem_dir *dir; - struct ftrace_event_file *file; - struct ftrace_event_call *call; + struct trace_subsystem_dir *dir; + struct trace_event_file *file; + struct trace_event_call *call; struct event_subsystem *system; struct trace_array *tr; int ret; @@ -2789,13 +3280,13 @@ static __init void event_trace_self_tests(void) continue; #endif - pr_info("Testing event %s: ", ftrace_event_name(call)); + pr_info("Testing event %s: ", trace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (file->flags & FTRACE_EVENT_FL_ENABLED) { + if (file->flags & EVENT_FILE_FL_ENABLED) { pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; @@ -2868,7 +3359,9 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static void +static struct trace_array *event_tr; + +static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { @@ -2899,7 +3392,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); @@ -2915,6 +3408,9 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; + event_tr = top_trace_array(); + if (WARN_ON(!event_tr)) + return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); diff --git a/kernel/kernel/trace/trace_events_filter.c b/kernel/kernel/trace/trace_events_filter.c index 52adf02d7..681630254 100644 --- a/kernel/kernel/trace/trace_events_filter.c +++ b/kernel/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -643,7 +687,7 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } -static inline struct event_filter *event_filter(struct ftrace_event_file *file) +static inline struct event_filter *event_filter(struct trace_event_file *file) { if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) return file->event_call->filter; @@ -652,7 +696,7 @@ static inline struct event_filter *event_filter(struct ftrace_event_file *file) } /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) +void print_event_filter(struct trace_event_file *file, struct trace_seq *s) { struct event_filter *filter = event_filter(file); @@ -780,14 +824,14 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void filter_disable(struct ftrace_event_file *file) +static void filter_disable(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags &= ~TRACE_EVENT_FL_FILTERED; else - file->flags &= ~FTRACE_EVENT_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; } static void __free_filter(struct event_filter *filter) @@ -837,9 +881,9 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static inline void __remove_filter(struct ftrace_event_file *file) +static inline void __remove_filter(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; filter_disable(file); if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) @@ -848,10 +892,10 @@ static inline void __remove_filter(struct ftrace_event_file *file) remove_filter_string(file->filter); } -static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, +static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -860,9 +904,9 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, } } -static inline void __free_subsystem_filter(struct ftrace_event_file *file) +static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { __free_filter(call->filter); @@ -873,10 +917,10 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file) } } -static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, +static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -929,15 +973,15 @@ static bool is_string_field(struct ftrace_event_field *field) field->filter_type == FILTER_PTR_STRING; } -static int is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return 0; + return false; if (!is_string_field(field) && op == OP_GLOB) - return 0; + return false; - return 1; + return true; } static filter_pred_fn_t select_comparison_fn(int op, int field_size, @@ -999,7 +1043,11 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); if (field->filter_type == FILTER_STATIC_STRING) { @@ -1025,7 +1073,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (field->filter_type == FILTER_CPU) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); @@ -1342,7 +1393,7 @@ parse_operand: } static struct filter_pred *create_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, + struct trace_event_call *call, int op, char *operand1, char *operand2) { struct ftrace_event_field *field; @@ -1564,7 +1615,7 @@ static int fold_pred_tree(struct event_filter *filter, filter->preds); } -static int replace_preds(struct ftrace_event_call *call, +static int replace_preds(struct trace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, bool dry_run) @@ -1677,20 +1728,20 @@ fail: return err; } -static inline void event_set_filtered_flag(struct ftrace_event_file *file) +static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags |= TRACE_EVENT_FL_FILTERED; else - file->flags |= FTRACE_EVENT_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; } -static inline void event_set_filter(struct ftrace_event_file *file, +static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) rcu_assign_pointer(call->filter, filter); @@ -1698,9 +1749,9 @@ static inline void event_set_filter(struct ftrace_event_file *file, rcu_assign_pointer(file->filter, filter); } -static inline void event_clear_filter(struct ftrace_event_file *file) +static inline void event_clear_filter(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) RCU_INIT_POINTER(call->filter, NULL); @@ -1709,33 +1760,33 @@ static inline void event_clear_filter(struct ftrace_event_file *file) } static inline void -event_set_no_set_filter_flag(struct ftrace_event_file *file) +event_set_no_set_filter_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; else - file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void -event_clear_no_set_filter_flag(struct ftrace_event_file *file) +event_clear_no_set_filter_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; else - file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool -event_no_set_filter_flag(struct ftrace_event_file *file) +event_no_set_filter_flag(struct trace_event_file *file) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; - if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && @@ -1750,12 +1801,12 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct ftrace_subsystem_dir *dir, +static int replace_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { - struct ftrace_event_file *file; + struct trace_event_file *file; struct filter_list *filter_item; struct filter_list *tmp; LIST_HEAD(filter_list); @@ -1899,8 +1950,8 @@ static void create_filter_finish(struct filter_parse_state *ps) } /** - * create_filter - create a filter for a ftrace_event_call - * @call: ftrace_event_call to create a filter for + * create_filter - create a filter for a trace_event_call + * @call: trace_event_call to create a filter for * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) @@ -1914,7 +1965,7 @@ static void create_filter_finish(struct filter_parse_state *ps) * information if @set_str is %true and the caller is responsible for * freeing it. */ -static int create_filter(struct ftrace_event_call *call, +static int create_filter(struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { @@ -1934,7 +1985,7 @@ static int create_filter(struct ftrace_event_call *call, return err; } -int create_event_filter(struct ftrace_event_call *call, +int create_event_filter(struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { @@ -1950,7 +2001,7 @@ int create_event_filter(struct ftrace_event_call *call, * Identical to create_filter() except that it creates a subsystem filter * and always remembers @filter_str. */ -static int create_system_filter(struct ftrace_subsystem_dir *dir, +static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { @@ -1976,9 +2027,9 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir, } /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_file *file, char *filter_string) +int apply_event_filter(struct trace_event_file *file, char *filter_string) { - struct ftrace_event_call *call = file->event_call; + struct trace_event_call *call = file->event_call; struct event_filter *filter; int err; @@ -2027,7 +2078,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string) return err; } -int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, +int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; @@ -2090,7 +2141,7 @@ struct function_filter_data { static char ** ftrace_function_filter_re(char *buf, int len, int *count) { - char *str, *sep, **re; + char *str, **re; str = kstrndup(buf, len, GFP_KERNEL); if (!str) @@ -2100,8 +2151,7 @@ ftrace_function_filter_re(char *buf, int len, int *count) * The argv_split function takes white space * as a separator, so convert ',' into spaces. */ - while ((sep = strchr(str, ','))) - *sep = ' '; + strreplace(str, ',', ' '); re = argv_split(GFP_KERNEL, str, count); kfree(str); @@ -2227,7 +2277,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct ftrace_event_call *call; + struct trace_event_call *call; mutex_lock(&event_mutex); @@ -2283,7 +2333,7 @@ out_unlock: static struct test_filter_data_t { char *filter; - struct ftrace_raw_ftrace_test_filter rec; + struct trace_event_raw_ftrace_test_filter rec; int match; char *not_visited; } test_filter_data[] = { diff --git a/kernel/kernel/trace/trace_events_trigger.c b/kernel/kernel/trace/trace_events_trigger.c index 8712df9de..42a4009fd 100644 --- a/kernel/kernel/trace/trace_events_trigger.c +++ b/kernel/kernel/trace/trace_events_trigger.c @@ -40,7 +40,7 @@ trigger_data_free(struct event_trigger_data *data) /** * event_triggers_call - Call triggers associated with a trace event - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * @rec: The trace entry for the event, NULL for unconditional invocation * * For each trigger associated with an event, invoke the trigger @@ -63,7 +63,7 @@ trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. */ enum event_trigger_type -event_triggers_call(struct ftrace_event_file *file, void *rec) +event_triggers_call(struct trace_event_file *file, void *rec) { struct event_trigger_data *data; enum event_trigger_type tt = ETT_NONE; @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); /** * event_triggers_post_call - Call 'post_triggers' for a trace event - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke * * For each trigger associated with an event, invoke the trigger @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * Called from tracepoint handlers (with rcu_read_lock_sched() held). */ void -event_triggers_post_call(struct ftrace_event_file *file, +event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt) { struct event_trigger_data *data; @@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(event_triggers_post_call); static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) { - struct ftrace_event_file *event_file = event_file_data(m->private); + struct trace_event_file *event_file = event_file_data(m->private); if (t == SHOW_AVAILABLE_TRIGGERS) return NULL; @@ -129,7 +129,7 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) static void *trigger_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_file *event_file; + struct trace_event_file *event_file; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); @@ -201,7 +201,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return ret; } -static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +static int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next = buff; struct event_command *p; @@ -227,7 +227,7 @@ static ssize_t event_trigger_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *event_file; + struct trace_event_file *event_file; ssize_t ret; char *buf; @@ -430,7 +430,7 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, +static int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable) { int ret = 0; @@ -438,12 +438,12 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, if (trigger_enable) { if (atomic_inc_return(&file->tm_ref) > 1) return ret; - set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags); ret = trace_event_enable_disable(file, 1, 1); } else { if (atomic_dec_return(&file->tm_ref) > 0) return ret; - clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags); ret = trace_event_enable_disable(file, 0, 1); } @@ -466,7 +466,7 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, void clear_event_triggers(struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { struct event_trigger_data *data; @@ -480,7 +480,7 @@ clear_event_triggers(struct trace_array *tr) /** * update_cond_flag - Set or reset the TRIGGER_COND bit - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * If an event has triggers and any of those triggers has a filter or * a post_trigger, trigger invocation needs to be deferred until after @@ -488,7 +488,7 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. */ -static void update_cond_flag(struct ftrace_event_file *file) +static void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; @@ -501,9 +501,9 @@ static void update_cond_flag(struct ftrace_event_file *file) } if (set_cond) - set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags); else - clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags); } /** @@ -511,7 +511,7 @@ static void update_cond_flag(struct ftrace_event_file *file) * @glob: The raw string used to register the trigger * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data to associate with the trigger - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * Common implementation for event trigger registration. * @@ -522,7 +522,7 @@ static void update_cond_flag(struct ftrace_event_file *file) */ static int register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct event_trigger_data *test; int ret = 0; @@ -557,7 +557,7 @@ out: * @glob: The raw string used to register the trigger * @ops: The trigger ops associated with the trigger * @test: Trigger-specific data used to find the trigger to remove - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * Common implementation for event trigger unregistration. * @@ -566,7 +566,7 @@ out: */ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -588,7 +588,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, /** * event_trigger_callback - Generic event_command @func implementation * @cmd_ops: The command ops, used for trigger registration - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger * @cmd: The cmd portion of the string used to register the trigger * @param: The params portion of the string used to register the trigger @@ -603,7 +603,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, */ static int event_trigger_callback(struct event_command *cmd_ops, - struct ftrace_event_file *file, + struct trace_event_file *file, char *glob, char *cmd, char *param) { struct event_trigger_data *trigger_data; @@ -688,7 +688,7 @@ event_trigger_callback(struct event_command *cmd_ops, * set_trigger_filter - Generic event_command @set_filter implementation * @filter_str: The filter string for the trigger, NULL to remove filter * @trigger_data: Trigger-specific data - * @file: The ftrace_event_file associated with the event + * @file: The trace_event_file associated with the event * * Common implementation for event command filter parsing and filter * instantiation. @@ -702,7 +702,7 @@ event_trigger_callback(struct event_command *cmd_ops, */ static int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; @@ -900,7 +900,7 @@ snapshot_count_trigger(struct event_trigger_data *data) static int register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file) + struct trace_event_file *file) { int ret = register_trigger(glob, ops, data, file); @@ -968,7 +968,7 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } * Skip 3: * stacktrace_trigger() * event_triggers_post_call() - * ftrace_raw_event_xxx() + * trace_event_raw_event_xxx() */ #define STACK_SKIP 3 @@ -1053,7 +1053,7 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) #define DISABLE_EVENT_STR "disable_event" struct enable_trigger_data { - struct ftrace_event_file *file; + struct trace_event_file *file; bool enable; }; @@ -1063,9 +1063,9 @@ event_enable_trigger(struct event_trigger_data *data) struct enable_trigger_data *enable_data = data->private_data; if (enable_data->enable) - clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); else - set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); } static void @@ -1077,7 +1077,7 @@ event_enable_count_trigger(struct event_trigger_data *data) return; /* Skip if the event is in a state we want to switch to */ - if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (data->count != -1) @@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, seq_printf(m, "%s:%s:%s", enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, enable_data->file->event_call->class->system, - ftrace_event_name(enable_data->file->event_call)); + trace_event_name(enable_data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited"); @@ -1159,10 +1159,10 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { static int event_enable_trigger_func(struct event_command *cmd_ops, - struct ftrace_event_file *file, + struct trace_event_file *file, char *glob, char *cmd, char *param) { - struct ftrace_event_file *event_enable_file; + struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -1294,7 +1294,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, static int event_enable_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1331,7 +1331,7 @@ out: static void event_enable_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, - struct ftrace_event_file *file) + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; diff --git a/kernel/kernel/trace/trace_export.c b/kernel/kernel/trace/trace_export.c index 174a6a711..39aa7aa66 100644 --- a/kernel/kernel/trace/trace_export.c +++ b/kernel/kernel/trace/trace_export.c @@ -125,7 +125,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ static int __init \ -ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ +ftrace_define_fields_##name(struct trace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ @@ -163,14 +163,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct ftrace_event_class __refdata event_class_ftrace_##call = { \ +struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ .reg = regfn, \ }; \ \ -struct ftrace_event_call __used event_##call = { \ +struct trace_event_call __used event_##call = { \ .class = &event_class_ftrace_##call, \ { \ .name = #call, \ @@ -179,7 +179,7 @@ struct ftrace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct ftrace_event_call __used \ +struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY @@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), filter, NULL) -int ftrace_event_is_function(struct ftrace_event_call *call) +bool ftrace_event_is_function(struct trace_event_call *call) { return call == &event_function; } diff --git a/kernel/kernel/trace/trace_functions_graph.c b/kernel/kernel/trace/trace_functions_graph.c index a51e79688..a663cbb84 100644 --- a/kernel/kernel/trace/trace_functions_graph.c +++ b/kernel/kernel/trace/trace_functions_graph.c @@ -83,13 +83,18 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, + /* Include sleep time (scheduled out) between entry and return */ + { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, + /* Include time within nested functions */ + { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | + TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, .opts = trace_opts }; @@ -107,8 +112,8 @@ enum { }; static void -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags); +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -278,14 +283,11 @@ int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_funcgraph_entry; + struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return 0; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) @@ -393,14 +395,11 @@ void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_funcgraph_exit; + struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, sizeof(*entry), flags, pc); if (!event) @@ -653,6 +652,7 @@ static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -660,7 +660,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -676,19 +676,19 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) print_graph_lat_fmt(s, ent); } /* No overhead */ - print_graph_duration(0, s, flags | FLAGS_FILL_START); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_START); if (type == TRACE_GRAPH_ENT) trace_seq_puts(s, "==========>"); else trace_seq_puts(s, "<=========="); - print_graph_duration(0, s, flags | FLAGS_FILL_END); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_END); trace_seq_putc(s, '\n'); } @@ -715,22 +715,22 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } static void -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags) +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; /* No real adata, just filling the column with spaces */ @@ -764,6 +764,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; @@ -792,7 +793,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead and duration */ - print_graph_duration(duration, s, flags); + print_graph_duration(tr, duration, s, flags); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) @@ -810,6 +811,7 @@ print_graph_entry_nested(struct trace_iterator *iter, { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; int i; if (data) { @@ -825,7 +827,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) @@ -849,6 +851,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; + struct trace_array *tr = iter->tr; int cpu = iter->cpu; /* Pid */ @@ -858,7 +861,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; /* Absolute time */ @@ -876,7 +879,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) print_graph_lat_fmt(s, ent); return; @@ -1027,6 +1030,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; @@ -1058,7 +1062,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, print_graph_prologue(iter, s, 0, 0, flags); /* Overhead and duration */ - print_graph_duration(duration, s, flags); + print_graph_duration(tr, duration, s, flags); /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) @@ -1091,7 +1095,8 @@ static enum print_line_t print_graph_comment(struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; struct trace_event *event; int depth = 0; @@ -1104,7 +1109,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, print_graph_prologue(iter, s, 0, 0, flags); /* No time */ - print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); /* Indentation */ if (depth > 0) @@ -1245,9 +1250,10 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s||| / \n", size, spaces); } -static void __print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct trace_array *tr, + struct seq_file *s, u32 flags) { - int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; if (lat) print_lat_header(s, flags); @@ -1289,11 +1295,12 @@ static void print_graph_headers(struct seq_file *s) void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; - if (trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; @@ -1301,7 +1308,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) print_trace_header(s, iter); } - __print_graph_headers_flags(s, flags); + __print_graph_headers_flags(tr, s, flags); } void graph_trace_open(struct trace_iterator *iter) @@ -1362,6 +1369,12 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; + if (bit == TRACE_GRAPH_SLEEP_TIME) + ftrace_graph_sleep_time_control(set); + + if (bit == TRACE_GRAPH_GRAPH_TIME) + ftrace_graph_graph_time_control(set); + return 0; } @@ -1454,12 +1467,12 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); - if (!register_ftrace_event(&graph_trace_entry_event)) { + if (!register_trace_event(&graph_trace_entry_event)) { pr_warning("Warning: could not register graph trace events\n"); return 1; } - if (!register_ftrace_event(&graph_trace_ret_event)) { + if (!register_trace_event(&graph_trace_ret_event)) { pr_warning("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/kernel/trace/trace_irqsoff.c b/kernel/kernel/trace/trace_irqsoff.c index d0e1d0e48..069942c22 100644 --- a/kernel/kernel/trace/trace_irqsoff.c +++ b/kernel/kernel/trace/trace_irqsoff.c @@ -32,7 +32,6 @@ enum { static int trace_type __read_mostly; static int save_flags; -static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -58,22 +57,16 @@ irq_trace(void) # define irq_trace() (0) #endif -#define TRACE_DISPLAY_GRAPH 1 - -static struct tracer_opt trace_opts[] = { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +static int irqsoff_display_graph(struct trace_array *tr, int set); +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +#else +static inline int irqsoff_display_graph(struct trace_array *tr, int set) +{ + return -EINVAL; +} +# define is_graph(tr) false #endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; - -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) /* * Sequence count - we record it when starting a measurement and @@ -153,15 +146,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int -irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +static int irqsoff_display_graph(struct trace_array *tr, int set) { int cpu; - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - - if (!(is_graph() ^ set)) + if (!(is_graph(tr) ^ set)) return 0; stop_irqsoff_tracer(irqsoff_trace, !set); @@ -210,7 +199,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) static void irqsoff_trace_open(struct trace_iterator *iter) { - if (is_graph()) + if (is_graph(iter->tr)) graph_trace_open(iter); } @@ -232,7 +221,7 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ - if (is_graph()) + if (is_graph(iter->tr)) return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; @@ -240,7 +229,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) + struct trace_array *tr = irqsoff_trace; + + if (is_graph(tr)) print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); else trace_default_header(s); @@ -251,7 +242,7 @@ __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (is_graph()) + if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, flags, pc); else trace_function(tr, ip, parent_ip, flags, pc); @@ -260,27 +251,23 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int -irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - +#ifdef CONFIG_FUNCTION_TRACER static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) { return -1; } +#endif static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_print_header(struct seq_file *s) { trace_default_header(s); @@ -296,16 +283,16 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) - return 0; + return false; } else { if (delta <= tr->max_latency) - return 0; + return false; } - return 1; + return true; } static void @@ -434,13 +421,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); - trace_preemptirqsoff_hist(TRACE_START, 1); + trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { - trace_preemptirqsoff_hist(TRACE_STOP, 0); + trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -534,12 +521,15 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_FUNCTION_TRACER +static bool function_enabled; + static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) return 0; if (graph) @@ -567,20 +557,40 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) function_enabled = false; } -static void irqsoff_function_set(struct trace_array *tr, int set) +static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + if (set) - register_irqsoff_function(tr, is_graph(), 1); + register_irqsoff_function(tr, is_graph(tr), 1); else - unregister_irqsoff_function(tr, is_graph()); + unregister_irqsoff_function(tr, is_graph(tr)); + return 1; +} +#else +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { } +static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; } +#endif /* CONFIG_FUNCTION_TRACER */ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { struct tracer *tracer = tr->current_trace; - if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(tr, set); + if (irqsoff_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return irqsoff_display_graph(tr, set); +#endif return trace_keep_overwrite(tracer, mask, set); } @@ -613,7 +623,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) if (irqsoff_busy) return -EBUSY; - save_flags = trace_flags; + save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); @@ -629,7 +639,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) /* Only toplevel instance supports graph tracing */ if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && - is_graph()))) + is_graph(tr)))) printk(KERN_ERR "failed to start irqsoff tracer\n"); irqsoff_busy = true; @@ -641,7 +651,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr) int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - stop_irqsoff_tracer(tr, is_graph()); + stop_irqsoff_tracer(tr, is_graph(tr)); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); @@ -677,8 +687,6 @@ static struct tracer irqsoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -711,8 +719,6 @@ static struct tracer preemptoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -747,8 +753,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/kernel/trace/trace_kdb.c b/kernel/kernel/trace/trace_kdb.c index 3ccf5c2c1..57149bce6 100644 --- a/kernel/kernel/trace/trace_kdb.c +++ b/kernel/kernel/trace/trace_kdb.c @@ -21,20 +21,22 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; + tr = iter.tr; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - old_userobj = trace_flags; + old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); @@ -82,7 +84,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); out: - trace_flags = old_userobj; + tr->trace_flags = old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); diff --git a/kernel/kernel/trace/trace_kprobe.c b/kernel/kernel/trace/trace_kprobe.c index d0ce590f0..c9956440d 100644 --- a/kernel/kernel/trace/trace_kprobe.c +++ b/kernel/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); @@ -348,7 +336,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; list_for_each_entry(tk, &probe_list, list) - if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && + if (strcmp(trace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; return NULL; @@ -359,7 +347,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, * if the file is NULL, enable "perf" handler, or enable "trace" handler. */ static int -enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { int ret = 0; @@ -394,7 +382,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ static int -disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { struct event_file_link *link = NULL; int wait = 0; @@ -523,7 +511,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call), tk->tp.call.class->system); if (old_tk) { ret = unregister_trace_kprobe(old_tk); @@ -572,7 +560,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - ftrace_event_name(&tk->tp.call), + trace_event_name(&tk->tp.call), mod->name, ret); } } @@ -829,7 +817,7 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tk->tp.call.class->system, - ftrace_event_name(&tk->tp.call)); + trace_event_name(&tk->tp.call)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -888,7 +876,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) struct trace_kprobe *tk = v; seq_printf(m, " %-44s %15lu %15lu\n", - ftrace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), tk->nhit, tk->rp.kp.nmissed); return 0; @@ -917,18 +905,18 @@ static const struct file_operations kprobe_profile_ops = { /* Kprobe handler */ static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, - struct ftrace_event_file *ftrace_file) + struct trace_event_file *trace_file) { struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; - WARN_ON(call != ftrace_file->event_call); + WARN_ON(call != trace_file->event_call); - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; local_save_flags(irq_flags); @@ -937,7 +925,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, dsize = __get_data_size(&tk->tp, regs); size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, call->event.type, size, irq_flags, pc); if (!event) @@ -947,7 +935,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); } @@ -965,18 +953,18 @@ NOKPROBE_SYMBOL(kprobe_trace_func); static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, - struct ftrace_event_file *ftrace_file) + struct trace_event_file *trace_file) { struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; - WARN_ON(call != ftrace_file->event_call); + WARN_ON(call != trace_file->event_call); - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; local_save_flags(irq_flags); @@ -985,7 +973,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, dsize = __get_data_size(&tk->tp, regs); size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, call->event.type, size, irq_flags, pc); if (!event) @@ -996,7 +984,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); } @@ -1025,7 +1013,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1056,7 +1044,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1081,7 +1069,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, } -static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +static int kprobe_event_define_fields(struct trace_event_call *event_call) { int ret, i; struct kprobe_trace_entry_head field; @@ -1104,7 +1092,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +static int kretprobe_event_define_fields(struct trace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry_head field; @@ -1134,7 +1122,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; @@ -1169,7 +1157,7 @@ static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; @@ -1206,11 +1194,11 @@ NOKPROBE_SYMBOL(kretprobe_perf_func); * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe * lockless, but we can't race with this __init function. */ -static int kprobe_register(struct ftrace_event_call *event, +static int kprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_kprobe *tk = (struct trace_kprobe *)event->data; - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: @@ -1276,10 +1264,10 @@ static struct trace_event_functions kprobe_funcs = { static int register_kprobe_event(struct trace_kprobe *tk) { - struct ftrace_event_call *call = &tk->tp.call; + struct trace_event_call *call = &tk->tp.call; int ret; - /* Initialize ftrace_event_call */ + /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1290,7 +1278,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) } if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; - ret = register_ftrace_event(&call->event); + ret = register_trace_event(&call->event); if (!ret) { kfree(call->print_fmt); return -ENODEV; @@ -1301,9 +1289,9 @@ static int register_kprobe_event(struct trace_kprobe *tk) ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", - ftrace_event_name(call)); + trace_event_name(call)); kfree(call->print_fmt); - unregister_ftrace_event(&call->event); + unregister_trace_event(&call->event); } return ret; } @@ -1364,10 +1352,10 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, return a1 + a2 + a3 + a4 + a5 + a6; } -static struct ftrace_event_file * +static struct trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { - struct ftrace_event_file *file; + struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) if (file->event_call == &tk->tp.call) @@ -1385,7 +1373,7 @@ static __init int kprobe_trace_self_tests_init(void) int ret, warn = 0; int (*target)(int, int, int, int, int, int); struct trace_kprobe *tk; - struct ftrace_event_file *file; + struct trace_event_file *file; if (tracing_is_disabled()) return -ENODEV; diff --git a/kernel/kernel/trace/trace_mmiotrace.c b/kernel/kernel/trace/trace_mmiotrace.c index 7a9ba62e9..2be8c4f24 100644 --- a/kernel/kernel/trace/trace_mmiotrace.c +++ b/kernel/kernel/trace/trace_mmiotrace.c @@ -298,7 +298,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { - struct ftrace_event_call *call = &event_mmiotrace_rw; + struct trace_event_call *call = &event_mmiotrace_rw; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -314,7 +314,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry->rw = *rw; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -328,7 +328,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { - struct ftrace_event_call *call = &event_mmiotrace_map; + struct trace_event_call *call = &event_mmiotrace_map; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -344,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry->map = *map; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/kernel/trace/trace_output.c b/kernel/kernel/trace/trace_output.c index c86bed272..9f19d839a 100644 --- a/kernel/kernel/trace/trace_output.c +++ b/kernel/kernel/trace/trace_output.c @@ -60,9 +60,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) } const char * -ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned long flags, - const struct trace_print_flags *flag_array) +trace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) { unsigned long mask; const char *str; @@ -95,11 +95,11 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return ret; } -EXPORT_SYMBOL(ftrace_print_flags_seq); +EXPORT_SYMBOL(trace_print_flags_seq); const char * -ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) +trace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) { int i; const char *ret = trace_seq_buffer_ptr(p); @@ -120,11 +120,11 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, return ret; } -EXPORT_SYMBOL(ftrace_print_symbols_seq); +EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 const char * -ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, +trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { int i; @@ -146,12 +146,12 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, return ret; } -EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +EXPORT_SYMBOL(trace_print_symbols_seq_u64); #endif const char * -ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, - unsigned int bitmask_size) +trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) { const char *ret = trace_seq_buffer_ptr(p); @@ -160,10 +160,10 @@ ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, return ret; } -EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); +EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); const char * -ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; const char *ret = trace_seq_buffer_ptr(p); @@ -175,11 +175,11 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) return ret; } -EXPORT_SYMBOL(ftrace_print_hex_seq); +EXPORT_SYMBOL(trace_print_hex_seq); const char * -ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count, - size_t el_size) +trace_print_array_seq(struct trace_seq *p, const void *buf, int count, + size_t el_size) { const char *ret = trace_seq_buffer_ptr(p); const char *prefix = ""; @@ -220,17 +220,17 @@ ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count, return ret; } -EXPORT_SYMBOL(ftrace_print_array_seq); +EXPORT_SYMBOL(trace_print_array_seq); -int ftrace_raw_output_prep(struct trace_iterator *iter, - struct trace_event *trace_event) +int trace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) { - struct ftrace_event_call *event; + struct trace_event_call *event; struct trace_seq *s = &iter->seq; struct trace_seq *p = &iter->tmp_seq; struct trace_entry *entry; - event = container_of(trace_event, struct ftrace_event_call, event); + event = container_of(trace_event, struct trace_event_call, event); entry = iter->ent; if (entry->type != event->event.type) { @@ -239,14 +239,14 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - trace_seq_printf(s, "%s: ", ftrace_event_name(event)); + trace_seq_printf(s, "%s: ", trace_event_name(event)); return trace_handle_return(s); } -EXPORT_SYMBOL(ftrace_raw_output_prep); +EXPORT_SYMBOL(trace_raw_output_prep); -static int ftrace_output_raw(struct trace_iterator *iter, char *name, - char *fmt, va_list ap) +static int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; @@ -256,18 +256,18 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, return trace_handle_return(s); } -int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); - ret = ftrace_output_raw(iter, name, fmt, ap); + ret = trace_output_raw(iter, name, fmt, ap); va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(ftrace_output_call); +EXPORT_SYMBOL_GPL(trace_output_call); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) @@ -322,8 +322,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) +static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) { struct file *file = NULL; unsigned long vmstart = 0; @@ -355,50 +355,6 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - struct mm_struct *mm = NULL; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! - */ - rcu_read_lock(); - task = find_task_by_vpid(entry->tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) - break; - - trace_seq_puts(s, " => "); - - if (!ip) { - trace_seq_puts(s, "??"); - trace_seq_putc(s, '\n'); - continue; - } - - seq_print_user_ip(s, mm, ip, sym_flags); - trace_seq_putc(s, '\n'); - } - - if (mm) - mmput(mm); - - return !trace_seq_has_overflowed(s); -} - -int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { if (!ip) { @@ -510,6 +466,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -522,7 +480,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } @@ -532,7 +490,8 @@ char trace_find_mark(unsigned long long d) static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { - unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + struct trace_array *tr = iter->tr; + unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -575,6 +534,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) int trace_print_context(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; unsigned long long t; @@ -586,7 +546,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { @@ -602,14 +562,15 @@ int trace_print_context(struct trace_iterator *iter) int trace_print_lat_context(struct trace_iterator *iter) { - u64 next_ts; + struct trace_array *tr = iter->tr; /* trace_find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; + u64 next_ts; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, &next_ts); - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); /* Restore the original ent_size */ iter->ent_size = ent_size; @@ -689,7 +650,7 @@ static int trace_search_list(struct list_head **list) } /* Did we used up all 65 thousand events??? */ - if ((last + 1) > FTRACE_MAX_EVENT) + if ((last + 1) > TRACE_EVENT_TYPE_MAX) return 0; *list = &e->list; @@ -707,7 +668,7 @@ void trace_event_read_unlock(void) } /** - * register_ftrace_event - register output for an event type + * register_trace_event - register output for an event type * @event: the event type to register * * Event types are stored in a hash and this hash is used to @@ -721,7 +682,7 @@ void trace_event_read_unlock(void) * * Returns the event type number or zero on error. */ -int register_ftrace_event(struct trace_event *event) +int register_trace_event(struct trace_event *event) { unsigned key; int ret = 0; @@ -739,7 +700,7 @@ int register_ftrace_event(struct trace_event *event) if (!event->type) { struct list_head *list = NULL; - if (next_event_type > FTRACE_MAX_EVENT) { + if (next_event_type > TRACE_EVENT_TYPE_MAX) { event->type = trace_search_list(&list); if (!event->type) @@ -785,12 +746,12 @@ int register_ftrace_event(struct trace_event *event) return ret; } -EXPORT_SYMBOL_GPL(register_ftrace_event); +EXPORT_SYMBOL_GPL(register_trace_event); /* * Used by module code with the trace_event_sem held for write. */ -int __unregister_ftrace_event(struct trace_event *event) +int __unregister_trace_event(struct trace_event *event) { hlist_del(&event->node); list_del(&event->list); @@ -798,18 +759,18 @@ int __unregister_ftrace_event(struct trace_event *event) } /** - * unregister_ftrace_event - remove a no longer used event + * unregister_trace_event - remove a no longer used event * @event: the event to remove */ -int unregister_ftrace_event(struct trace_event *event) +int unregister_trace_event(struct trace_event *event) { down_write(&trace_event_sem); - __unregister_ftrace_event(event); + __unregister_trace_event(event); up_write(&trace_event_sem); return 0; } -EXPORT_SYMBOL_GPL(unregister_ftrace_event); +EXPORT_SYMBOL_GPL(unregister_trace_event); /* * Standard events @@ -1091,13 +1052,49 @@ static struct trace_event trace_stack_event = { static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, int flags, struct trace_event *event) { + struct trace_array *tr = iter->tr; struct userstack_entry *field; struct trace_seq *s = &iter->seq; + struct mm_struct *mm = NULL; + unsigned int i; trace_assign_type(field, iter->ent); trace_seq_puts(s, "<user stack trace>\n"); - seq_print_userip_objs(field, s, flags); + + if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(field->tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = field->caller[i]; + + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + + if (!ip) { + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); + continue; + } + + seq_print_user_ip(s, mm, ip, flags); + trace_seq_putc(s, '\n'); + } + + if (mm) + mmput(mm); return trace_handle_return(s); } @@ -1257,7 +1254,7 @@ __init static int init_events(void) for (i = 0; events[i]; i++) { event = events[i]; - ret = register_ftrace_event(event); + ret = register_trace_event(event); if (!ret) { printk(KERN_WARNING "event %d failed to register\n", event->type); diff --git a/kernel/kernel/trace/trace_output.h b/kernel/kernel/trace/trace_output.h index 8ef2c40ef..fabc49bcd 100644 --- a/kernel/kernel/trace/trace_output.h +++ b/kernel/kernel/trace/trace_output.h @@ -14,10 +14,6 @@ trace_print_printk_msg_only(struct trace_iterator *iter); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern int seq_print_userip_objs(const struct userstack_entry *entry, - struct trace_seq *s, unsigned long sym_flags); -extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); @@ -32,7 +28,7 @@ extern int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ -extern int __unregister_ftrace_event(struct trace_event *event); +extern int __unregister_trace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_FIELD(s, x) \ diff --git a/kernel/kernel/trace/trace_printk.c b/kernel/kernel/trace/trace_printk.c index 36c1455b7..060df67db 100644 --- a/kernel/kernel/trace/trace_printk.c +++ b/kernel/kernel/trace/trace_printk.c @@ -178,6 +178,12 @@ static inline void format_mod_start(void) { } static inline void format_mod_stop(void) { } #endif /* CONFIG_MODULES */ +static bool __read_mostly trace_printk_enabled = true; + +void trace_printk_control(bool enabled) +{ + trace_printk_enabled = enabled; +} __initdata_or_module static struct notifier_block module_trace_bprintk_format_nb = { @@ -192,7 +198,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) if (unlikely(!fmt)) return 0; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; va_start(ap, fmt); @@ -207,7 +213,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) if (unlikely(!fmt)) return 0; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; return trace_vbprintk(ip, fmt, ap); @@ -219,7 +225,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; va_start(ap, fmt); @@ -231,7 +237,7 @@ EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; return trace_vprintk(ip, fmt, ap); @@ -267,6 +273,7 @@ static const char **find_next(void *v, loff_t *pos) if (*pos < last_index + start_index) return __start___tracepoint_str + (*pos - last_index); + start_index += last_index; return find_next_mod_format(start_index, v, fmt, pos); } diff --git a/kernel/kernel/trace/trace_probe.h b/kernel/kernel/trace/trace_probe.h index ab283e146..f6398db09 100644 --- a/kernel/kernel/trace/trace_probe.h +++ b/kernel/kernel/trace/trace_probe.h @@ -272,8 +272,8 @@ struct probe_arg { struct trace_probe { unsigned int flags; /* For TP_FLAG_* */ - struct ftrace_event_class class; - struct ftrace_event_call call; + struct trace_event_class class; + struct trace_event_call call; struct list_head files; ssize_t size; /* trace entry size */ unsigned int nr_args; @@ -281,7 +281,7 @@ struct trace_probe { }; struct event_file_link { - struct ftrace_event_file *file; + struct trace_event_file *file; struct list_head list; }; @@ -302,19 +302,19 @@ static nokprobe_inline void call_fetch(struct fetch_param *fprm, } /* Check the name is good for event/group/fields */ -static inline int is_good_name(const char *name) +static inline bool is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') - return 0; + return false; while (*++name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return 0; + return false; } - return 1; + return true; } static inline struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) { struct event_file_link *link; diff --git a/kernel/kernel/trace/trace_sched_switch.c b/kernel/kernel/trace/trace_sched_switch.c index 419ca37e7..4c896a010 100644 --- a/kernel/kernel/trace/trace_sched_switch.c +++ b/kernel/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) { if (unlikely(!sched_ref)) return; @@ -26,7 +27,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n } static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee) { if (unlikely(!sched_ref)) return; diff --git a/kernel/kernel/trace/trace_sched_wakeup.c b/kernel/kernel/trace/trace_sched_wakeup.c index d6e100372..9d4399b55 100644 --- a/kernel/kernel/trace/trace_sched_wakeup.c +++ b/kernel/kernel/trace/trace_sched_wakeup.c @@ -34,31 +34,28 @@ static arch_spinlock_t wakeup_lock = static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); -static int wakeup_graph_entry(struct ftrace_graph_ent *trace); -static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; -static bool function_enabled; - -#define TRACE_DISPLAY_GRAPH 1 -static struct tracer_opt trace_opts[] = { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +static int wakeup_display_graph(struct trace_array *tr, int set); +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +#else +static inline int wakeup_display_graph(struct trace_array *tr, int set) +{ + return 0; +} +# define is_graph(tr) false #endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) #ifdef CONFIG_FUNCTION_TRACER +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); + +static bool function_enabled; + /* * Prologue for the wakeup function tracers. * @@ -128,14 +125,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } -#endif /* CONFIG_FUNCTION_TRACER */ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) return 0; if (graph) @@ -163,20 +159,40 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) function_enabled = false; } -static void wakeup_function_set(struct trace_array *tr, int set) +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + if (set) - register_wakeup_function(tr, is_graph(), 1); + register_wakeup_function(tr, is_graph(tr), 1); else - unregister_wakeup_function(tr, is_graph()); + unregister_wakeup_function(tr, is_graph(tr)); + return 1; +} +#else +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_wakeup_function(struct trace_array *tr, int graph) { } +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; } +#endif /* CONFIG_FUNCTION_TRACER */ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { struct tracer *tracer = tr->current_trace; - if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(tr, set); + if (wakeup_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return wakeup_display_graph(tr, set); +#endif return trace_keep_overwrite(tracer, mask, set); } @@ -203,14 +219,9 @@ static void stop_func_tracer(struct trace_array *tr, int graph) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int -wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +static int wakeup_display_graph(struct trace_array *tr, int set) { - - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - - if (!(is_graph() ^ set)) + if (!(is_graph(tr) ^ set)) return 0; stop_func_tracer(tr, !set); @@ -259,7 +270,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) static void wakeup_trace_open(struct trace_iterator *iter) { - if (is_graph()) + if (is_graph(iter->tr)) graph_trace_open(iter); } @@ -279,7 +290,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ - if (is_graph()) + if (is_graph(iter->tr)) return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; @@ -287,7 +298,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) static void wakeup_print_header(struct seq_file *s) { - if (is_graph()) + if (is_graph(wakeup_trace)) print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); else trace_default_header(s); @@ -298,7 +309,7 @@ __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (is_graph()) + if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, flags, pc); else trace_function(tr, ip, parent_ip, flags, pc); @@ -306,27 +317,20 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int -wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} - static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_print_header(struct seq_file *s) { trace_default_header(s); @@ -342,16 +346,16 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) - return 0; + return false; } else { if (delta <= tr->max_latency) - return 0; + return false; } - return 1; + return true; } static void @@ -369,7 +373,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_context_switch; + struct trace_event_call *call = &event_context_switch; struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -388,7 +392,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, flags, pc); } static void @@ -397,7 +401,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { - struct ftrace_event_call *call = &event_wakeup; + struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; struct ring_buffer *buffer = tr->trace_buffer.buffer; @@ -416,11 +420,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, flags, pc); } static void notrace -probe_wakeup_sched_switch(void *ignore, +probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; @@ -514,7 +518,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(void *ignore, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); @@ -635,7 +639,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(tr, is_graph())) + if (start_func_tracer(tr, is_graph(tr))) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -648,7 +652,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(tr, is_graph()); + stop_func_tracer(tr, is_graph(tr)); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -659,7 +663,7 @@ static bool wakeup_busy; static int __wakeup_tracer_init(struct trace_array *tr) { - save_flags = trace_flags; + save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); @@ -740,8 +744,6 @@ static struct tracer wakeup_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -762,8 +764,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -784,8 +784,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, diff --git a/kernel/kernel/trace/trace_stack.c b/kernel/kernel/trace/trace_stack.c index 3f3449624..202df6cff 100644 --- a/kernel/kernel/trace/trace_stack.c +++ b/kernel/kernel/trace/trace_stack.c @@ -16,30 +16,22 @@ #include "trace.h" -#define STACK_TRACE_ENTRIES 500 - -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; -static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +unsigned stack_trace_index[STACK_TRACE_ENTRIES]; /* * Reserve one entry for the passed in ip. This will allow * us to remove most or all of the stack size overhead * added by the stack tracer itself. */ -static struct stack_trace max_stack_trace = { +struct stack_trace stack_trace_max = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; -static unsigned long max_stack_size; -static arch_spinlock_t max_stack_lock = +unsigned long stack_trace_max_size; +arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static DEFINE_PER_CPU(int, trace_active); @@ -48,83 +40,102 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void print_max_stack(void) +void stack_trace_print(void) { long i; int size; pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + stack_trace_max.nr_entries); - for (i = 0; i < max_stack_trace.nr_entries; i++) { + for (i = 0; i < stack_trace_max.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (i+1 == max_stack_trace.nr_entries || + if (i+1 == stack_trace_max.nr_entries || stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; + size = stack_trace_index[i]; else - size = stack_dump_index[i] - stack_dump_index[i+1]; + size = stack_trace_index[i] - stack_trace_index[i+1]; - pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_trace_index[i], size, (void *)stack_dump_trace[i]); } } -static inline void +/* + * When arch-specific code overides this function, the following + * data should be filled up, assuming stack_trace_max_lock is held to + * prevent concurrent updates. + * stack_trace_index[] + * stack_trace_max + * stack_trace_max_size + */ +void __weak check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; /* Remove the frame of the tracer */ this_size -= frame_size; - if (this_size <= max_stack_size) + if (this_size <= stack_trace_max_size) return; /* we do not handle interrupt stacks yet */ if (!object_is_on_stack(stack)) return; + /* Can't do this from NMI context (can cause deadlocks) */ + if (in_nmi()) + return; + local_irq_save(flags); - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); + + /* + * RCU may not be watching, make it see us. + * The stack trace code uses rcu_sched. + */ + rcu_irq_enter(); /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; /* a race could have already updated it */ - if (this_size <= max_stack_size) + if (this_size <= stack_trace_max_size) goto out; - max_stack_size = this_size; + stack_trace_max_size = this_size; - max_stack_trace.nr_entries = 0; + stack_trace_max.nr_entries = 0; + stack_trace_max.skip = 3; - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + save_stack_trace(&stack_trace_max); - save_stack_trace(&max_stack_trace); + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < stack_trace_max.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + if (i == stack_trace_max.nr_entries) + i = 0; /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -136,15 +147,18 @@ check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < max_stack_trace.nr_entries) { + while (i < stack_trace_max.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_trace_index[x] = this_size; p = start; - for (; p < top && i < max_stack_trace.nr_entries; p++) { + for (; p < top && i < stack_trace_max.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,10 +170,10 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. */ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); - max_stack_size -= tracer_frame; + stack_trace_max_size -= tracer_frame; } } } @@ -168,13 +182,18 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + stack_trace_max.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { - print_max_stack(); + stack_trace_print(); BUG(); } out: - arch_spin_unlock(&max_stack_lock); + rcu_irq_exit(); + arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } @@ -192,24 +211,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -262,9 +264,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, cpu = smp_processor_id(); per_cpu(trace_active, cpu)++; - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); *ptr = val; - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); per_cpu(trace_active, cpu)--; local_irq_restore(flags); @@ -284,7 +286,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -307,7 +309,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) cpu = smp_processor_id(); per_cpu(trace_active, cpu)++; - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); if (*pos == 0) return SEQ_START_TOKEN; @@ -319,7 +321,7 @@ static void t_stop(struct seq_file *m, void *p) { int cpu; - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); cpu = smp_processor_id(); per_cpu(trace_active, cpu)--; @@ -354,9 +356,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + stack_trace_max.nr_entries); - if (!stack_tracer_enabled && !max_stack_size) + if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); return 0; @@ -364,17 +366,17 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= max_stack_trace.nr_entries || + if (i >= stack_trace_max.nr_entries || stack_dump_trace[i] == ULONG_MAX) return 0; - if (i+1 == max_stack_trace.nr_entries || + if (i+1 == stack_trace_max.nr_entries || stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; + size = stack_trace_index[i]; else - size = stack_dump_index[i] - stack_dump_index[i+1]; + size = stack_trace_index[i] - stack_trace_index[i+1]; - seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + seq_printf(m, "%3ld) %8d %5d ", i, stack_trace_index[i], size); trace_lookup_stack(m, i); @@ -464,7 +466,7 @@ static __init int stack_trace_init(void) return 0; trace_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); + &stack_trace_max_size, &stack_max_size_fops); trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); diff --git a/kernel/kernel/trace/trace_syscalls.c b/kernel/kernel/trace/trace_syscalls.c index f97f6e3a6..0655afbea 100644 --- a/kernel/kernel/trace/trace_syscalls.c +++ b/kernel/kernel/trace/trace_syscalls.c @@ -13,13 +13,13 @@ static DEFINE_MUTEX(syscall_trace_lock); -static int syscall_enter_register(struct ftrace_event_call *event, +static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type, void *data); -static int syscall_exit_register(struct ftrace_event_call *event, +static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type, void *data); static struct list_head * -syscall_get_enter_fields(struct ftrace_event_call *call) +syscall_get_enter_fields(struct trace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -110,6 +110,7 @@ static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; @@ -136,7 +137,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) + if (tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ @@ -219,7 +220,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) return pos; } -static int __init set_syscall_print_fmt(struct ftrace_event_call *call) +static int __init set_syscall_print_fmt(struct trace_event_call *call) { char *print_fmt; int len; @@ -244,7 +245,7 @@ static int __init set_syscall_print_fmt(struct ftrace_event_call *call) return 0; } -static void __init free_syscall_print_fmt(struct ftrace_event_call *call) +static void __init free_syscall_print_fmt(struct trace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -252,7 +253,7 @@ static void __init free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -static int __init syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -275,7 +276,7 @@ static int __init syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -static int __init syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct trace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -293,7 +294,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; - struct ftrace_event_file *ftrace_file; + struct trace_event_file *trace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -308,11 +309,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ - ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); - if (!ftrace_file) + trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!trace_file) return; - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -334,14 +335,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { struct trace_array *tr = data; - struct ftrace_event_file *ftrace_file; + struct trace_event_file *trace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -355,11 +356,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ - ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); - if (!ftrace_file) + trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!trace_file) return; - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -380,12 +381,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); } -static int reg_event_syscall_enter(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int ret = 0; @@ -405,8 +406,8 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, return ret; } -static void unreg_event_syscall_enter(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int num; @@ -422,8 +423,8 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, mutex_unlock(&syscall_trace_lock); } -static int reg_event_syscall_exit(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int ret = 0; @@ -443,8 +444,8 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, return ret; } -static void unreg_event_syscall_exit(struct ftrace_event_file *file, - struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct trace_event_file *file, + struct trace_event_call *call) { struct trace_array *tr = file->tr; int num; @@ -460,7 +461,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, mutex_unlock(&syscall_trace_lock); } -static int __init init_syscall_trace(struct ftrace_event_call *call) +static int __init init_syscall_trace(struct trace_event_call *call) { int id; int num; @@ -493,7 +494,7 @@ struct trace_event_functions exit_syscall_print_funcs = { .trace = print_syscall_exit, }; -struct ftrace_event_class __refdata event_class_syscall_enter = { +struct trace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, .define_fields = syscall_enter_define_fields, @@ -501,7 +502,7 @@ struct ftrace_event_class __refdata event_class_syscall_enter = { .raw_init = init_syscall_trace, }; -struct ftrace_event_class __refdata event_class_syscall_exit = { +struct trace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, .define_fields = syscall_exit_define_fields, @@ -584,7 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -static int perf_sysenter_enable(struct ftrace_event_call *call) +static int perf_sysenter_enable(struct trace_event_call *call) { int ret = 0; int num; @@ -605,7 +606,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call) return ret; } -static void perf_sysenter_disable(struct ftrace_event_call *call) +static void perf_sysenter_disable(struct trace_event_call *call) { int num; @@ -656,7 +657,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -static int perf_sysexit_enable(struct ftrace_event_call *call) +static int perf_sysexit_enable(struct trace_event_call *call) { int ret = 0; int num; @@ -677,7 +678,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call) return ret; } -static void perf_sysexit_disable(struct ftrace_event_call *call) +static void perf_sysexit_disable(struct trace_event_call *call) { int num; @@ -693,10 +694,10 @@ static void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ -static int syscall_enter_register(struct ftrace_event_call *event, +static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: @@ -721,10 +722,10 @@ static int syscall_enter_register(struct ftrace_event_call *event, return 0; } -static int syscall_exit_register(struct ftrace_event_call *event, +static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: diff --git a/kernel/kernel/trace/trace_uprobe.c b/kernel/kernel/trace/trace_uprobe.c index 6dd022c7b..d2f6d0be3 100644 --- a/kernel/kernel/trace/trace_uprobe.c +++ b/kernel/kernel/trace/trace_uprobe.c @@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && + if (strcmp(trace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -323,7 +323,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + old_tu = find_probe_event(trace_event_name(&tu->tp.call), tu->tp.call.class->system); if (old_tu) { /* delete old event */ @@ -600,8 +600,23 @@ static int probes_seq_show(struct seq_file *m, void *v) int i; seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - ftrace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + trace_event_name(&tu->tp.call)); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -651,7 +666,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) struct trace_uprobe *tu = v; seq_printf(m, " %s %-44s %15lu\n", tu->filename, - ftrace_event_name(&tu->tp.call), tu->nhit); + trace_event_name(&tu->tp.call), tu->nhit); return 0; } @@ -770,26 +785,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize, - struct ftrace_event_file *ftrace_file) + struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; void *data; int size, esize; - struct ftrace_event_call *call = &tu->tp.call; + struct trace_event_call *call = &tu->tp.call; - WARN_ON(call != ftrace_file->event_call); + WARN_ON(call != trace_file->event_call); if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; - if (ftrace_trigger_soft_disabled(ftrace_file)) + if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, call->event.type, size, 0, 0); if (!event) return; @@ -806,7 +821,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); + event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); } /* uprobe handler */ @@ -853,12 +868,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - ftrace_event_name(&tu->tp.call), + trace_event_name(&tu->tp.call), entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { trace_seq_printf(s, "%s: (0x%lx)", - ftrace_event_name(&tu->tp.call), + trace_event_name(&tu->tp.call), entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -881,7 +896,7 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int -probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, +probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, filter_func_t filter) { bool enabled = trace_probe_is_enabled(&tu->tp); @@ -938,7 +953,7 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, } static void -probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) +probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) { if (!trace_probe_is_enabled(&tu->tp)) return; @@ -967,7 +982,7 @@ probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) uprobe_buffer_disable(); } -static int uprobe_event_define_fields(struct ftrace_event_call *event_call) +static int uprobe_event_define_fields(struct trace_event_call *event_call) { int ret, i, size; struct uprobe_trace_entry_head field; @@ -1093,13 +1108,17 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - struct ftrace_event_call *call = &tu->tp.call; + struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1159,11 +1178,11 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, #endif /* CONFIG_PERF_EVENTS */ static int -trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, +trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_uprobe *tu = event->data; - struct ftrace_event_file *file = data; + struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: @@ -1272,10 +1291,10 @@ static struct trace_event_functions uprobe_funcs = { static int register_uprobe_event(struct trace_uprobe *tu) { - struct ftrace_event_call *call = &tu->tp.call; + struct trace_event_call *call = &tu->tp.call; int ret; - /* Initialize ftrace_event_call */ + /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; @@ -1283,21 +1302,22 @@ static int register_uprobe_event(struct trace_uprobe *tu) if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; - ret = register_ftrace_event(&call->event); + ret = register_trace_event(&call->event); if (!ret) { kfree(call->print_fmt); return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register uprobe event: %s\n", - ftrace_event_name(call)); + trace_event_name(call)); kfree(call->print_fmt); - unregister_ftrace_event(&call->event); + unregister_trace_event(&call->event); } return ret; diff --git a/kernel/kernel/tracepoint.c b/kernel/kernel/tracepoint.c index 3490407dc..ecd536de6 100644 --- a/kernel/kernel/tracepoint.c +++ b/kernel/kernel/tracepoint.c @@ -91,11 +91,13 @@ static void debug_print_probes(struct tracepoint_func *funcs) printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); } -static struct tracepoint_func *func_add(struct tracepoint_func **funcs, - struct tracepoint_func *tp_func) +static struct tracepoint_func * +func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, + int prio) { - int nr_probes = 0; struct tracepoint_func *old, *new; + int nr_probes = 0; + int pos = -1; if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); @@ -104,18 +106,33 @@ static struct tracepoint_func *func_add(struct tracepoint_func **funcs, old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + /* Insert before probes of lower priority */ + if (pos < 0 && old[nr_probes].prio < prio) + pos = nr_probes; if (old[nr_probes].func == tp_func->func && old[nr_probes].data == tp_func->data) return ERR_PTR(-EEXIST); + } } /* + 2 : one for new probe, one for NULL func */ new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); - if (old) - memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes] = *tp_func; + if (old) { + if (pos < 0) { + pos = nr_probes; + memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); + } else { + /* Copy higher priority probes ahead of the new probe */ + memcpy(new, old, pos * sizeof(struct tracepoint_func)); + /* Copy the rest after it. */ + memcpy(new + pos + 1, old + pos, + (nr_probes - pos) * sizeof(struct tracepoint_func)); + } + } else + pos = 0; + new[pos] = *tp_func; new[nr_probes + 1].func = NULL; *funcs = new; debug_print_probes(*funcs); @@ -174,7 +191,7 @@ static void *func_remove(struct tracepoint_func **funcs, * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, - struct tracepoint_func *func) + struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; @@ -183,7 +200,7 @@ static int tracepoint_add_func(struct tracepoint *tp, tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); - old = func_add(&tp_funcs, func); + old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { WARN_ON_ONCE(1); return PTR_ERR(old); @@ -240,6 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * @tp: tracepoint * @probe: probe handler * @data: tracepoint data + * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -247,7 +265,8 @@ static int tracepoint_remove_func(struct tracepoint *tp, * performed either with a tracepoint module going notifier, or from * within module exit functions. */ -int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) +int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, + void *data, int prio) { struct tracepoint_func tp_func; int ret; @@ -255,10 +274,30 @@ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; - ret = tracepoint_add_func(tp, &tp_func); + tp_func.prio = prio; + ret = tracepoint_add_func(tp, &tp_func, prio); mutex_unlock(&tracepoints_mutex); return ret; } +EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @tp: tracepoint + * @probe: probe handler + * @data: tracepoint data + * @prio: priority of this function over other registered functions + * + * Returns 0 if ok, error value on error. + * Note: if @tp is within a module, the caller is responsible for + * unregistering the probe before the module is gone. This can be + * performed either with a tracepoint module going notifier, or from + * within module exit functions. + */ +int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) +{ + return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); +} EXPORT_SYMBOL_GPL(tracepoint_probe_register); /** diff --git a/kernel/kernel/user_namespace.c b/kernel/kernel/user_namespace.c index 4109f8320..88fefa68c 100644 --- a/kernel/kernel/user_namespace.c +++ b/kernel/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); @@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (user_ns == current_user_ns()) return -EINVAL; - /* Threaded processes may not enter a different user namespace */ - if (atomic_read(¤t->mm->mm_users) > 1) + /* Tasks that share a thread group must share a user namespace */ + if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) diff --git a/kernel/kernel/watchdog.c b/kernel/kernel/watchdog.c index 3b8e827ee..201775b44 100644 --- a/kernel/kernel/watchdog.c +++ b/kernel/kernel/watchdog.c @@ -19,10 +19,12 @@ #include <linux/sysctl.h> #include <linux/smpboot.h> #include <linux/sched/rt.h> +#include <linux/tick.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> #include <linux/perf_event.h> +#include <linux/kthread.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -55,11 +57,38 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 #endif +static struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. + */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). + */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -83,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn; * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic = +unsigned int __read_mostly hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -146,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif /* @@ -207,7 +244,7 @@ void touch_all_softlockup_watchdogs(void) * do we care if a 0 races with a timestamp? * all it means is the softlock check starts one cycle later */ - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; } @@ -236,15 +273,15 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static int is_hardlockup(void) +static bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return 1; + return true; __this_cpu_write(hrtimer_interrupts_saved, hrint); - return 0; + return false; } #endif @@ -252,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; @@ -293,6 +330,7 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); + struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) @@ -303,15 +341,27 @@ static void watchdog_overflow_callback(struct perf_event *event, */ printk_kill(); - if (hardlockup_panic) { - panic("Watchdog detected hard LOCKUP on cpu %d", - this_cpu); - } else { - raw_spin_lock(&watchdog_output_lock); - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", - this_cpu); - raw_spin_unlock(&watchdog_output_lock); - } + raw_spin_lock(&watchdog_output_lock); + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + raw_spin_unlock(&watchdog_output_lock); + if (hardlockup_panic) + panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; @@ -330,6 +380,9 @@ static void watchdog_interrupt_count(void) static int watchdog_nmi_enable(unsigned int cpu); static void watchdog_nmi_disable(unsigned int cpu); +static int watchdog_enable_all_cpus(void); +static void watchdog_disable_all_cpus(void); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -617,46 +670,9 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_online_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_online_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -670,46 +686,107 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static void restart_watchdog_hrtimer(void *info) +/* + * park all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function returns an error if kthread_park() of a watchdog thread + * fails. In this situation, the watchdog threads of some CPUs can already + * be parked and the watchdog threads of other CPUs can still be runnable. + * Callers are expected to handle this special condition as appropriate in + * their context. + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). + */ +static int watchdog_park_threads(void) +{ + int cpu, ret = 0; + + for_each_watchdog_cpu(cpu) { + ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (ret) + break; + } + + return ret; +} + +/* + * unpark all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). + */ +static void watchdog_unpark_threads(void) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); - int ret; + int cpu; + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); +} + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); /* - * No need to cancel and restart hrtimer if it is currently executing - * because it will reprogram itself with the new period now. - * We should never see it unqueued here because we are running per-cpu - * with interrupts disabled. + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). */ - ret = hrtimer_try_to_cancel(hrtimer); - if (ret == 1) - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; } -static void update_watchdog(int cpu) +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) { + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; /* - * Make sure that perf event counter will adopt to a new - * sampling period. Updating the sampling period directly would - * be much nicer but we do not have an API for that now so - * let's use a big hammer. - * Hrtimer will adopt the new period on the next tick but this - * might be late already so we have to restart the timer as well. + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. */ - watchdog_nmi_disable(cpu); - smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); - watchdog_nmi_enable(cpu); + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); } -static void update_watchdog_all_cpus(void) +static int update_watchdog_all_cpus(void) { - int cpu; + int ret; - get_online_cpus(); - for_each_online_cpu(cpu) - update_watchdog(cpu); - put_online_cpus(); + ret = watchdog_park_threads(); + if (ret) + return ret; + + watchdog_unpark_threads(); + + return 0; } static int watchdog_enable_all_cpus(void) @@ -717,7 +794,8 @@ static int watchdog_enable_all_cpus(void) int err = 0; if (!watchdog_running) { - err = smpboot_register_percpu_thread(&watchdog_threads); + err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_cpumask); if (err) pr_err("Failed to create watchdog threads, disabled\n"); else @@ -727,15 +805,20 @@ static int watchdog_enable_all_cpus(void) * Enable/disable the lockup detectors or * change the sample period 'on the fly'. */ - update_watchdog_all_cpus(); + err = update_watchdog_all_cpus(); + + if (err) { + watchdog_disable_all_cpus(); + pr_err("Failed to update lockup detectors, disabled\n"); + } } + if (err) + watchdog_enabled = 0; + return err; } -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { if (watchdog_running) { @@ -744,6 +827,8 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL + /* * Update the run state of the lockup detectors. */ @@ -785,8 +870,15 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -820,15 +912,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } while (cmpxchg(&watchdog_enabled, old, new) != old); /* - * Update the run state of the lockup detectors. - * Restore 'watchdog_enabled' on failure. + * Update the run state of the lockup detectors. There is _no_ + * need to check the value returned by proc_watchdog_update() + * and to restore the previous value of 'watchdog_enabled' as + * both lockup detectors are disabled if proc_watchdog_update() + * returns an error. */ err = proc_watchdog_update(); - if (err) - watchdog_enabled = old; } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -870,8 +964,15 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -879,23 +980,79 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, goto out; /* - * Update the sample period. - * Restore 'watchdog_thresh' on failure. + * Update the sample period. Restore on failure. */ set_sample_period(); err = proc_watchdog_update(); - if (err) + if (err) { watchdog_thresh = old; + set_sample_period(); + } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } + +/* + * The cpumask is the mask of possible cpus that the watchdog can run + * on, not the mask of cpus it is actually running on. This allows the + * user to specify a mask that will include cpus that have not yet + * been brought online, if desired. + */ +int proc_watchdog_cpumask(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (!err && write) { + /* Remove impossible cpus to keep sysctl output cleaner. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, + cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate + * a temporary cpumask, so we are likely not in a + * position to do much else to make things better. + */ + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask) != 0) + pr_err("cpumask update failed\n"); + } + } +out: + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); + return err; +} + #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) { + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_copy(&watchdog_cpumask, housekeeping_mask); + } else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#endif + if (watchdog_enabled) watchdog_enable_all_cpus(); } diff --git a/kernel/kernel/workqueue.c b/kernel/kernel/workqueue.c index 21daecdfd..efd1f5822 100644 --- a/kernel/kernel/workqueue.c +++ b/kernel/kernel/workqueue.c @@ -134,6 +134,11 @@ enum { * * PR: wq_pool_mutex protected for writes. RCU protected for reads. * + * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. + * + * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or + * sched-RCU for reads. + * * WQ: wq->mutex protected. * * WR: wq->mutex protected for writes. RCU protected for reads. @@ -254,8 +259,8 @@ struct workqueue_struct { int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ - struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ - struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ + struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ + struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -275,7 +280,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */ + struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ }; static struct kmem_cache *pwq_cache; @@ -287,12 +292,7 @@ static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); /* see the comment above the definition of WQ_POWER_EFFICIENT */ -#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT -static bool wq_power_efficient = true; -#else -static bool wq_power_efficient; -#endif - +static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ @@ -306,6 +306,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -339,22 +341,26 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); static int worker_thread(void *__worker); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&wq_pool_mutex), \ - "RCU or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&wq_pool_mutex), \ + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&wq->mutex), \ - "RCU or wq->mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&wq->mutex), \ + "RCU or wq->mutex should be held") + +#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&wq->mutex) && \ + !lockdep_is_held(&wq_pool_mutex), \ + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -585,7 +591,8 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called either with pwq_lock held or RCU read locked. + * This must be called with any of wq_pool_mutex, wq->mutex or RCU + * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -594,7 +601,17 @@ static int worker_pool_assign_id(struct worker_pool *pool) static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { - assert_rcu_or_wq_mutex(wq); + assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -1000,7 +1017,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking + * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to * be scheduled starts at @work and includes any consecutive work with @@ -1740,9 +1757,7 @@ static struct worker *create_worker(struct worker_pool *pool) goto fail; set_user_nice(worker->task, pool->attrs->nice); - - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(worker->task, pool->attrs->cpumask); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -2642,7 +2657,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2651,7 +2666,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue); * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed - * repeatedly until it becomes empty. The number of flushing is detemined + * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ @@ -2983,36 +2998,6 @@ int schedule_on_each_cpu(work_func_t func) } /** - * flush_scheduled_work - ensure that any scheduled work has run to completion. - * - * Forces execution of the kernel-global workqueue and blocks until its - * completion. - * - * Think twice before calling this function! It's very easy to get into - * trouble if you don't take great care. Either of the following situations - * will lead to deadlock: - * - * One of the work items currently on the workqueue needs to acquire - * a lock held by your code or its caller. - * - * Your code is running in the context of a work routine. - * - * They will be detected by lockdep when they occur, but the first might not - * occur very often. It depends on what work items are on the workqueue and - * what locks they need, which you have no control over. - * - * In most situations flushing the entire workqueue is overkill; you merely - * need to know that a particular work item isn't queued and isn't running. - * In such cases you should use cancel_delayed_work_sync() or - * cancel_work_sync() instead. - */ -void flush_scheduled_work(void) -{ - flush_workqueue(system_wq); -} -EXPORT_SYMBOL(flush_scheduled_work); - -/** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must @@ -3117,7 +3102,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * - * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called @@ -3260,6 +3245,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int node; + int target_node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3271,13 +3257,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(attrs->cpumask, + wq_numa_possible_cpumask[node])) { + target_node = node; + break; + } + } + } + /* nope, create a new one */ - pool = kzalloc(sizeof(*pool), GFP_KERNEL); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); if (!pool || init_worker_pool(pool) < 0) goto fail; lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + pool->node = target_node; /* * no_numa isn't a worker_pool attribute, always clear it. See @@ -3285,17 +3283,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) */ pool->attrs->no_numa = false; - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(pool->attrs->cpumask, - wq_numa_possible_cpumask[node])) { - pool->node = node; - break; - } - } - } - if (worker_pool_assign_id(pool) < 0) goto fail; @@ -3461,20 +3448,9 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } -/* undo alloc_unbound_pwq(), used only in the error path */ -static void free_unbound_pwq(struct pool_workqueue *pwq) -{ - lockdep_assert_held(&wq_pool_mutex); - - if (pwq) { - put_unbound_pool(pwq->pool); - kmem_cache_free(pwq_cache, pwq); - } -} - /** - * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node - * @attrs: the wq_attrs of interest + * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of the default pwq of the target workqueue * @node: the target NUMA node * @cpu_going_down: if >= 0, the CPU to consider as offline * @cpumask: outarg, the resulting cpumask @@ -3524,6 +3500,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, { struct pool_workqueue *old_pwq; + lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ @@ -3534,46 +3511,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, return old_pwq; } -/** - * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue - * @wq: the target workqueue - * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() - * - * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA - * machines, this function maps a separate pwq to each NUMA node with - * possibles CPUs in @attrs->cpumask so that work items are affine to the - * NUMA node it was issued on. Older pwqs are released as in-flight work - * items finish. Note that a work item which repeatedly requeues itself - * back-to-back will stay on its current pwq. - * - * Performs GFP_KERNEL allocations. - * - * Return: 0 on success and -errno on failure. - */ -int apply_workqueue_attrs(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) +/* context to store the prepared attrs & pwqs before applying */ +struct apply_wqattrs_ctx { + struct workqueue_struct *wq; /* target workqueue */ + struct workqueue_attrs *attrs; /* attrs to apply */ + struct list_head list; /* queued for batching commit */ + struct pool_workqueue *dfl_pwq; + struct pool_workqueue *pwq_tbl[]; +}; + +/* free the resources after success or abort */ +static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) +{ + if (ctx) { + int node; + + for_each_node(node) + put_pwq_unlocked(ctx->pwq_tbl[node]); + put_pwq_unlocked(ctx->dfl_pwq); + + free_workqueue_attrs(ctx->attrs); + + kfree(ctx); + } +} + +/* allocate the attrs and pwqs for later installation */ +static struct apply_wqattrs_ctx * +apply_wqattrs_prepare(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) { + struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs, *tmp_attrs; - struct pool_workqueue **pwq_tbl, *dfl_pwq; - int node, ret; + int node; - /* only unbound workqueues can change attributes */ - if (WARN_ON(!(wq->flags & WQ_UNBOUND))) - return -EINVAL; + lockdep_assert_held(&wq_pool_mutex); - /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]), + GFP_KERNEL); - pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!pwq_tbl || !new_attrs || !tmp_attrs) - goto enomem; + if (!ctx || !new_attrs || !tmp_attrs) + goto out_free; - /* make a copy of @attrs and sanitize it */ + /* + * Calculate the attrs of the default pwq. + * If the user configured cpumask doesn't overlap with the + * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. + */ copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); + if (unlikely(cpumask_empty(new_attrs->cpumask))) + cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); /* * We may create multiple pwqs with differing cpumasks. Make a @@ -3583,75 +3573,129 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, copy_workqueue_attrs(tmp_attrs, new_attrs); /* - * CPUs should stay stable across pwq creations and installations. - * Pin CPUs, determine the target cpumask for each node and create - * pwqs accordingly. - */ - get_online_cpus(); - - mutex_lock(&wq_pool_mutex); - - /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ - dfl_pwq = alloc_unbound_pwq(wq, new_attrs); - if (!dfl_pwq) - goto enomem_pwq; + ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); + if (!ctx->dfl_pwq) + goto out_free; for_each_node(node) { - if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { - pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!pwq_tbl[node]) - goto enomem_pwq; + if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { + ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); + if (!ctx->pwq_tbl[node]) + goto out_free; } else { - dfl_pwq->refcnt++; - pwq_tbl[node] = dfl_pwq; + ctx->dfl_pwq->refcnt++; + ctx->pwq_tbl[node] = ctx->dfl_pwq; } } - mutex_unlock(&wq_pool_mutex); + /* save the user configured attrs and sanitize it. */ + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + ctx->attrs = new_attrs; + + ctx->wq = wq; + free_workqueue_attrs(tmp_attrs); + return ctx; + +out_free: + free_workqueue_attrs(tmp_attrs); + free_workqueue_attrs(new_attrs); + apply_wqattrs_cleanup(ctx); + return NULL; +} + +/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ +static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) +{ + int node; /* all pwqs have been created successfully, let's install'em */ - mutex_lock(&wq->mutex); + mutex_lock(&ctx->wq->mutex); - copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ for_each_node(node) - pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, + ctx->pwq_tbl[node]); /* @dfl_pwq might not have been used, ensure it's linked */ - link_pwq(dfl_pwq); - swap(wq->dfl_pwq, dfl_pwq); + link_pwq(ctx->dfl_pwq); + swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); - mutex_unlock(&wq->mutex); + mutex_unlock(&ctx->wq->mutex); +} - /* put the old pwqs */ - for_each_node(node) - put_pwq_unlocked(pwq_tbl[node]); - put_pwq_unlocked(dfl_pwq); +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + get_online_cpus(); + mutex_lock(&wq_pool_mutex); +} +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); put_online_cpus(); - ret = 0; - /* fall through */ -out_free: - free_workqueue_attrs(tmp_attrs); - free_workqueue_attrs(new_attrs); - kfree(pwq_tbl); +} + +static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct apply_wqattrs_ctx *ctx; + int ret = -ENOMEM; + + /* only unbound workqueues can change attributes */ + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + + ctx = apply_wqattrs_prepare(wq, attrs); + + /* the ctx has been prepared successfully, let's commit it */ + if (ctx) { + apply_wqattrs_commit(ctx); + ret = 0; + } + + apply_wqattrs_cleanup(ctx); + return ret; +} -enomem_pwq: - free_unbound_pwq(dfl_pwq); - for_each_node(node) - if (pwq_tbl && pwq_tbl[node] != dfl_pwq) - free_unbound_pwq(pwq_tbl[node]); - mutex_unlock(&wq_pool_mutex); - put_online_cpus(); -enomem: - ret = -ENOMEM; - goto out_free; +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on. Older pwqs are released as in-flight work + * items finish. Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. + * + * Return: 0 on success and -errno on failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + int ret; + + apply_wqattrs_lock(); + ret = apply_workqueue_attrs_locked(wq, attrs); + apply_wqattrs_unlock(); + + return ret; } /** @@ -3687,7 +3731,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, lockdep_assert_held(&wq_pool_mutex); - if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) + if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || + wq->unbound_attrs->no_numa) return; /* @@ -3698,48 +3743,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, target_attrs = wq_update_unbound_numa_attrs_buf; cpumask = target_attrs->cpumask; - mutex_lock(&wq->mutex); - if (wq->unbound_attrs->no_numa) - goto out_unlock; - copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); /* * Let's determine what needs to be done. If the target cpumask is - * different from wq's, we need to compare it to @pwq's and create - * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. + * different from the default pwq's, we need to compare it to @pwq's + * and create a new one if they don't match. If the target cpumask + * equals the default pwq's, the default pwq should be used. */ - if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - goto out_unlock; + return; } else { goto use_dfl_pwq; } - mutex_unlock(&wq->mutex); - /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", wq->name); - mutex_lock(&wq->mutex); goto use_dfl_pwq; } - /* - * Install the new pwq. As this function is called only from CPU - * hotplug callbacks and applying a new attrs is wrapped with - * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed - * inbetween. - */ + /* Install the new pwq. */ mutex_lock(&wq->mutex); old_pwq = numa_pwq_tbl_install(wq, node, pwq); goto out_unlock; use_dfl_pwq: + mutex_lock(&wq->mutex); spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); spin_unlock_irq(&wq->dfl_pwq->pool->lock); @@ -3868,7 +3902,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); } @@ -4423,7 +4457,7 @@ static void rebind_workers(struct worker_pool *pool) /* * Restore CPU affinity of all workers. As all idle workers should * be on the run-queue of the associated CPU before any local - * wake-ups for concurrency management happen, restore CPU affinty + * wake-ups for concurrency management happen, restore CPU affinity * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ @@ -4736,6 +4770,82 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +static int workqueue_apply_unbound_cpumask(void) +{ + LIST_HEAD(ctxs); + int ret = 0; + struct workqueue_struct *wq; + struct apply_wqattrs_ctx *ctx, *n; + + lockdep_assert_held(&wq_pool_mutex); + + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_UNBOUND)) + continue; + /* creating multiple pwqs breaks ordering guarantee */ + if (wq->flags & __WQ_ORDERED) + continue; + + ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); + if (!ctx) { + ret = -ENOMEM; + break; + } + + list_add_tail(&ctx->list, &ctxs); + } + + list_for_each_entry_safe(ctx, n, &ctxs, list) { + if (!ret) + apply_wqattrs_commit(ctx); + apply_wqattrs_cleanup(ctx); + } + + return ret; +} + +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Retun: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + cpumask_var_t saved_cpumask; + + if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + + /* save the old wq_unbound_cpumask. */ + cpumask_copy(saved_cpumask, wq_unbound_cpumask); + + /* update wq_unbound_cpumask at first and apply it to wqs. */ + cpumask_copy(wq_unbound_cpumask, cpumask); + ret = workqueue_apply_unbound_cpumask(); + + /* restore the wq_unbound_cpumask when failed. */ + if (ret < 0) + cpumask_copy(wq_unbound_cpumask, saved_cpumask); + + apply_wqattrs_unlock(); + } + + free_cpumask_var(saved_cpumask); + return ret; +} + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via @@ -4842,13 +4952,13 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) { struct workqueue_attrs *attrs; + lockdep_assert_held(&wq_pool_mutex); + attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!attrs) return NULL; - mutex_lock(&wq->mutex); copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); return attrs; } @@ -4857,18 +4967,22 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int ret; + int ret = -ENOMEM; + + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) - return -ENOMEM; + goto out_unlock; if (sscanf(buf, "%d", &attrs->nice) == 1 && attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); + ret = apply_workqueue_attrs_locked(wq, attrs); else ret = -EINVAL; +out_unlock: + apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } @@ -4892,16 +5006,20 @@ static ssize_t wq_cpumask_store(struct device *dev, { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int ret; + int ret = -ENOMEM; + + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) - return -ENOMEM; + goto out_unlock; ret = cpumask_parse(buf, attrs->cpumask); if (!ret) - ret = apply_workqueue_attrs(wq, attrs); + ret = apply_workqueue_attrs_locked(wq, attrs); +out_unlock: + apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } @@ -4925,18 +5043,22 @@ static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int v, ret; + int v, ret = -ENOMEM; + + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) - return -ENOMEM; + goto out_unlock; ret = -EINVAL; if (sscanf(buf, "%d", &v) == 1) { attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); + ret = apply_workqueue_attrs_locked(wq, attrs); } +out_unlock: + apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } @@ -4954,9 +5076,49 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int written; + + mutex_lock(&wq_pool_mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq_unbound_cpumask)); + mutex_unlock(&wq_pool_mutex); + + return written; +} + +static ssize_t wq_unbound_cpumask_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + cpumask_var_t cpumask; + int ret; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_parse(buf, cpumask); + if (!ret) + ret = workqueue_set_unbound_cpumask(cpumask); + + free_cpumask_var(cpumask); + return ret ? ret : count; +} + +static struct device_attribute wq_sysfs_cpumask_attr = + __ATTR(cpumask, 0644, wq_unbound_cpumask_show, + wq_unbound_cpumask_store); + static int __init wq_sysfs_init(void) { - return subsys_virtual_register(&wq_subsys, NULL); + int err; + + err = subsys_virtual_register(&wq_subsys, NULL); + if (err) + return err; + + return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr); } core_initcall(wq_sysfs_init); @@ -4988,7 +5150,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) int ret; /* - * Adjusting max_active or creating new pwqs by applyting + * Adjusting max_active or creating new pwqs by applying * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ @@ -5104,6 +5266,9 @@ static int __init init_workqueues(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |