summaryrefslogtreecommitdiffstats
path: root/kernel/kernel
diff options
context:
space:
mode:
authorYunhong Jiang <yunhong.jiang@linux.intel.com>2017-03-08 23:13:28 -0800
committerYunhong Jiang <yunhong.jiang@linux.intel.com>2017-03-08 23:36:15 -0800
commit52f993b8e89487ec9ee15a7fb4979e0f09a45b27 (patch)
treed65304486afe0bea4a311c783c0d72791c8c0aa2 /kernel/kernel
parentc189ccac5702322ed843fe17057035b7222a59b6 (diff)
Upgrade to 4.4.50-rt62
The current kernel is based on rt kernel v4.4.6-rt14. We will upgrade it to 4.4.50-rt62. The command to achieve it is: a) Clone a git repo from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git b) Get the diff between this two changesets: git diff 640eca2901f3435e616157b11379d3223a44b391 705619beeea1b0b48219a683fd1a901a86fdaf5e where the two commits are: [yjiang5@jnakajim-build linux-stable-rt]$ git show --oneline --name-only 640eca2901f3435e616157b11379d3223a44b391 640eca2901f3 v4.4.6-rt14 localversion-rt [yjiang5@jnakajim-build linux-stable-rt]$ git show --oneline --name-only 705619beeea1b0b48219a683fd1a901a86fdaf5e 705619beeea1 Linux 4.4.50-rt62 localversion-rt c) One patch has been backported thus revert the patch before applying. filterdiff -p1 -x scripts/package/Makefile ~/tmp/v4.4.6-rt14-4.4.50-rt62.diff |patch -p1 --dry-run Upstream status: backport Change-Id: I244d57a32f6066e5a5b9915f9fbf99e7bbca6e01 Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
Diffstat (limited to 'kernel/kernel')
-rw-r--r--kernel/kernel/audit_watch.c8
-rw-r--r--kernel/kernel/auditsc.c332
-rw-r--r--kernel/kernel/bpf/helpers.c2
-rw-r--r--kernel/kernel/bpf/inode.c10
-rw-r--r--kernel/kernel/bpf/syscall.c24
-rw-r--r--kernel/kernel/bpf/verifier.c66
-rw-r--r--kernel/kernel/capability.c82
-rw-r--r--kernel/kernel/cgroup.c38
-rw-r--r--kernel/kernel/configs/tiny.config8
-rw-r--r--kernel/kernel/cpu.c12
-rw-r--r--kernel/kernel/cpuset.c36
-rw-r--r--kernel/kernel/cred.c2
-rw-r--r--kernel/kernel/debug/debug_core.c4
-rw-r--r--kernel/kernel/events/core.c139
-rw-r--r--kernel/kernel/events/ring_buffer.c10
-rw-r--r--kernel/kernel/events/uprobes.c5
-rw-r--r--kernel/kernel/exit.c29
-rw-r--r--kernel/kernel/fork.c51
-rw-r--r--kernel/kernel/futex.c29
-rw-r--r--kernel/kernel/irq/generic-chip.c21
-rw-r--r--kernel/kernel/irq/msi.c20
-rw-r--r--kernel/kernel/jump_label.c43
-rw-r--r--kernel/kernel/kexec_file.c3
-rw-r--r--kernel/kernel/locking/mcs_spinlock.h8
-rw-r--r--kernel/kernel/locking/mutex.c9
-rw-r--r--kernel/kernel/locking/qspinlock.c60
-rw-r--r--kernel/kernel/locking/rtmutex.c121
-rw-r--r--kernel/kernel/locking/rtmutex_common.h5
-rw-r--r--kernel/kernel/memremap.c4
-rw-r--r--kernel/kernel/module.c13
-rw-r--r--kernel/kernel/panic.c45
-rw-r--r--kernel/kernel/power/hibernate.c5
-rw-r--r--kernel/kernel/power/snapshot.c10
-rw-r--r--kernel/kernel/power/suspend_test.c4
-rw-r--r--kernel/kernel/printk/braille.c4
-rw-r--r--kernel/kernel/printk/printk.c10
-rw-r--r--kernel/kernel/ptrace.c28
-rw-r--r--kernel/kernel/rcu/tree_plugin.h1
-rw-r--r--kernel/kernel/sched/Makefile2
-rw-r--r--kernel/kernel/sched/core.c74
-rw-r--r--kernel/kernel/sched/cputime.c99
-rw-r--r--kernel/kernel/sched/fair.c78
-rw-r--r--kernel/kernel/sched/loadavg.c11
-rw-r--r--kernel/kernel/sched/sched.h13
-rw-r--r--kernel/kernel/sched/swork.c (renamed from kernel/kernel/sched/work-simple.c)2
-rw-r--r--kernel/kernel/sysctl.c60
-rw-r--r--kernel/kernel/sysctl_binary.c2
-rw-r--r--kernel/kernel/time/clocksource.c52
-rw-r--r--kernel/kernel/time/hrtimer.c7
-rw-r--r--kernel/kernel/time/ntp.c20
-rw-r--r--kernel/kernel/time/posix-cpu-timers.c1
-rw-r--r--kernel/kernel/time/tick-broadcast.c3
-rw-r--r--kernel/kernel/time/timekeeping.c35
-rw-r--r--kernel/kernel/time/timekeeping_debug.c9
-rw-r--r--kernel/kernel/time/timer.c4
-rw-r--r--kernel/kernel/trace/Makefile4
-rw-r--r--kernel/kernel/trace/ring_buffer.c35
-rw-r--r--kernel/kernel/trace/trace.c66
-rw-r--r--kernel/kernel/trace/trace_events.c17
-rw-r--r--kernel/kernel/trace/trace_functions_graph.c17
-rw-r--r--kernel/kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/kernel/trace/trace_printk.c10
-rw-r--r--kernel/kernel/watchdog.c12
-rw-r--r--kernel/kernel/workqueue.c42
64 files changed, 1419 insertions, 565 deletions
diff --git a/kernel/kernel/audit_watch.c b/kernel/kernel/audit_watch.c
index 656c7e93a..939945a56 100644
--- a/kernel/kernel/audit_watch.c
+++ b/kernel/kernel/audit_watch.c
@@ -19,6 +19,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- rcu_read_lock();
- exe_file = rcu_dereference(tsk->mm->exe_file);
+ exe_file = get_task_exe_file(tsk);
+ if (!exe_file)
+ return 0;
ino = exe_file->f_inode->i_ino;
dev = exe_file->f_inode->i_sb->s_dev;
- rcu_read_unlock();
+ fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/kernel/auditsc.c b/kernel/kernel/auditsc.c
index b86cc0495..48f45987d 100644
--- a/kernel/kernel/auditsc.c
+++ b/kernel/kernel/auditsc.c
@@ -73,6 +73,7 @@
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <uapi/linux/limits.h>
#include "audit.h"
@@ -82,7 +83,8 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
@@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
return rc;
}
-/*
- * to_send and len_sent accounting are very loose estimates. We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
- struct audit_buffer **ab,
- int arg_num,
- size_t *len_sent,
- const char __user *p,
- char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
{
- char arg_num_len_buf[12];
- const char __user *tmp_p = p;
- /* how many digits are in arg_num? 5 is the length of ' a=""' */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
- size_t len, len_left, to_send;
- size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
- unsigned int i, has_cntl = 0, too_long = 0;
- int ret;
-
- /* strnlen_user includes the null we don't want to send */
- len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
- /*
- * We just created this mm, if we can't find the strings
- * we just copied into it something is _very_ wrong. Similar
- * for strings that are too long, we should not have created
- * any.
- */
- if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
- send_sig(SIGKILL, current, 0);
- return -1;
+ long len_max;
+ long len_rem;
+ long len_full;
+ long len_buf;
+ long len_abuf;
+ long len_tmp;
+ bool require_data;
+ bool encode;
+ unsigned int iter;
+ unsigned int arg;
+ char *buf_head;
+ char *buf;
+ const char __user *p = (const char __user *)current->mm->arg_start;
+
+ /* NOTE: this buffer needs to be large enough to hold all the non-arg
+ * data we put in the audit record for this argument (see the
+ * code below) ... at this point in time 96 is plenty */
+ char abuf[96];
+
+ /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+ * current value of 7500 is not as important as the fact that it
+ * is less than 8k, a setting of 7500 gives us plenty of wiggle
+ * room if we go over a little bit in the logging below */
+ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+ len_max = MAX_EXECVE_AUDIT_LEN;
+
+ /* scratch buffer to hold the userspace args */
+ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf_head) {
+ audit_panic("out of memory for argv string");
+ return;
}
+ buf = buf_head;
- /* walk the whole argument looking for non-ascii chars */
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+ len_rem = len_max;
+ len_buf = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ iter = 0;
+ arg = 0;
do {
- if (len_left > MAX_EXECVE_AUDIT_LEN)
- to_send = MAX_EXECVE_AUDIT_LEN;
- else
- to_send = len_left;
- ret = copy_from_user(buf, tmp_p, to_send);
- /*
- * There is no reason for this copy to be short. We just
- * copied them here, and the mm hasn't been exposed to user-
- * space yet.
- */
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
- }
- buf[to_send] = '\0';
- has_cntl = audit_string_contains_control(buf, to_send);
- if (has_cntl) {
- /*
- * hex messages get logged as 2 bytes, so we can only
- * send half as much in each message
- */
- max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
- break;
- }
- len_left -= to_send;
- tmp_p += to_send;
- } while (len_left > 0);
-
- len_left = len;
-
- if (len > max_execve_audit_len)
- too_long = 1;
-
- /* rewalk the argument actually logging the message */
- for (i = 0; len_left > 0; i++) {
- int room_left;
-
- if (len_left > max_execve_audit_len)
- to_send = max_execve_audit_len;
- else
- to_send = len_left;
-
- /* do we have space left to send this argument in this ab? */
- room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
- if (has_cntl)
- room_left -= (to_send * 2);
- else
- room_left -= to_send;
- if (room_left < 0) {
- *len_sent = 0;
- audit_log_end(*ab);
- *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
- if (!*ab)
- return 0;
- }
+ /* NOTE: we don't ever want to trust this value for anything
+ * serious, but the audit record format insists we
+ * provide an argument length for really long arguments,
+ * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+ * to use strncpy_from_user() to obtain this value for
+ * recording in the log, although we don't use it
+ * anywhere here to avoid a double-fetch problem */
+ if (len_full == 0)
+ len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /* read more data from userspace */
+ if (require_data) {
+ /* can we make more room in the buffer? */
+ if (buf != buf_head) {
+ memmove(buf_head, buf, len_buf);
+ buf = buf_head;
+ }
+
+ /* fetch as much as we can of the argument */
+ len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+ len_max - len_buf);
+ if (len_tmp == -EFAULT) {
+ /* unable to copy from userspace */
+ send_sig(SIGKILL, current, 0);
+ goto out;
+ } else if (len_tmp == (len_max - len_buf)) {
+ /* buffer is not large enough */
+ require_data = true;
+ /* NOTE: if we are going to span multiple
+ * buffers force the encoding so we stand
+ * a chance at a sane len_full value and
+ * consistent record encoding */
+ encode = true;
+ len_full = len_full * 2;
+ p += len_tmp;
+ } else {
+ require_data = false;
+ if (!encode)
+ encode = audit_string_contains_control(
+ buf, len_tmp);
+ /* try to use a trusted value for len_full */
+ if (len_full < len_max)
+ len_full = (encode ?
+ len_tmp * 2 : len_tmp);
+ p += len_tmp + 1;
+ }
+ len_buf += len_tmp;
+ buf_head[len_buf] = '\0';
- /*
- * first record needs to say how long the original string was
- * so we can be sure nothing was lost.
- */
- if ((i == 0) && (too_long))
- audit_log_format(*ab, " a%d_len=%zu", arg_num,
- has_cntl ? 2*len : len);
-
- /*
- * normally arguments are small enough to fit and we already
- * filled buf above when we checked for control characters
- * so don't bother with another copy_from_user
- */
- if (len >= max_execve_audit_len)
- ret = copy_from_user(buf, p, to_send);
- else
- ret = 0;
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+ /* length of the buffer in the audit record? */
+ len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
- buf[to_send] = '\0';
-
- /* actually log it */
- audit_log_format(*ab, " a%d", arg_num);
- if (too_long)
- audit_log_format(*ab, "[%d]", i);
- audit_log_format(*ab, "=");
- if (has_cntl)
- audit_log_n_hex(*ab, buf, to_send);
- else
- audit_log_string(*ab, buf);
-
- p += to_send;
- len_left -= to_send;
- *len_sent += arg_num_len;
- if (has_cntl)
- *len_sent += to_send * 2;
- else
- *len_sent += to_send;
- }
- /* include the null we didn't log */
- return len + 1;
-}
-static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
-{
- int i, len;
- size_t len_sent = 0;
- const char __user *p;
- char *buf;
+ /* write as much as we can to the audit log */
+ if (len_buf > 0) {
+ /* NOTE: some magic numbers here - basically if we
+ * can't fit a reasonable amount of data into the
+ * existing audit buffer, flush it and start with
+ * a new buffer */
+ if ((sizeof(abuf) + 8) > len_rem) {
+ len_rem = len_max;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context,
+ GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ goto out;
+ }
- p = (const char __user *)current->mm->arg_start;
+ /* create the non-arg portion of the arg record */
+ len_tmp = 0;
+ if (require_data || (iter > 0) ||
+ ((len_abuf + sizeof(abuf)) > len_rem)) {
+ if (iter == 0) {
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d_len=%lu",
+ arg, len_full);
+ }
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d[%d]=", arg, iter++);
+ } else
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d=", arg);
+ WARN_ON(len_tmp >= sizeof(abuf));
+ abuf[sizeof(abuf) - 1] = '\0';
+
+ /* log the arg in the audit record */
+ audit_log_format(*ab, "%s", abuf);
+ len_rem -= len_tmp;
+ len_tmp = len_buf;
+ if (encode) {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem / 2; /* encoding */
+ audit_log_n_hex(*ab, buf, len_tmp);
+ len_rem -= len_tmp * 2;
+ len_abuf -= len_tmp * 2;
+ } else {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem - 2; /* quotes */
+ audit_log_n_string(*ab, buf, len_tmp);
+ len_rem -= len_tmp + 2;
+ /* don't subtract the "2" because we still need
+ * to add quotes to the remaining string */
+ len_abuf -= len_tmp;
+ }
+ len_buf -= len_tmp;
+ buf += len_tmp;
+ }
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+ /* ready to move to the next argument? */
+ if ((len_buf == 0) && !require_data) {
+ arg++;
+ iter = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ }
+ } while (arg < context->execve.argc);
- /*
- * we need some kernel buffer to hold the userspace args. Just
- * allocate one big one rather than allocating one of the right size
- * for every single argument inside audit_log_single_execve_arg()
- * should be <8k allocation so should be pretty safe.
- */
- buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
- if (!buf) {
- audit_panic("out of memory for argv string");
- return;
- }
+ /* NOTE: the caller handles the final audit_log_end() call */
- for (i = 0; i < context->execve.argc; i++) {
- len = audit_log_single_execve_arg(context, ab, i,
- &len_sent, p, buf);
- if (len <= 0)
- break;
- p += len;
- }
- kfree(buf);
+out:
+ kfree(buf_head);
}
static void show_special(struct audit_context *context, int *call_panic)
diff --git a/kernel/kernel/bpf/helpers.c b/kernel/kernel/bpf/helpers.c
index 4504ca661..50da680c4 100644
--- a/kernel/kernel/bpf/helpers.c
+++ b/kernel/kernel/bpf/helpers.c
@@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
if (!task)
return -EINVAL;
- memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
return 0;
}
diff --git a/kernel/kernel/bpf/inode.c b/kernel/kernel/bpf/inode.c
index 5a8a797d5..cb85d228b 100644
--- a/kernel/kernel/bpf/inode.c
+++ b/kernel/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
case BPF_TYPE_PROG:
- atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ raw = bpf_prog_inc(raw);
break;
case BPF_TYPE_MAP:
- bpf_map_inc(raw, true);
+ raw = bpf_map_inc(raw, true);
break;
default:
WARN_ON_ONCE(1);
@@ -277,7 +277,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
goto out;
raw = bpf_any_get(inode->i_private, *type);
- touch_atime(&path);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
path_put(&path);
return raw;
@@ -357,7 +358,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
static struct dentry *bpf_mount(struct file_system_type *type, int flags,
const char *dev_name, void *data)
{
- return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+ return mount_nodev(type, flags, data, bpf_fill_super);
}
static struct file_system_type bpf_fs_type = {
@@ -365,7 +366,6 @@ static struct file_system_type bpf_fs_type = {
.name = "bpf",
.mount = bpf_mount,
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
};
MODULE_ALIAS_FS("bpf");
diff --git a/kernel/kernel/bpf/syscall.c b/kernel/kernel/bpf/syscall.c
index 3b39550d8..4e32cc94e 100644
--- a/kernel/kernel/bpf/syscall.c
+++ b/kernel/kernel/bpf/syscall.c
@@ -181,11 +181,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
return f.file->private_data;
}
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
{
- atomic_inc(&map->refcnt);
+ if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&map->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
if (uref)
atomic_inc(&map->usercnt);
+ return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -197,7 +204,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
if (IS_ERR(map))
return map;
- bpf_map_inc(map, true);
+ map = bpf_map_inc(map, true);
fdput(f);
return map;
@@ -580,6 +587,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&prog->aux->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ return prog;
+}
+
/* called by sockets/tracing/seccomp before attaching program to an event
* pairs with bpf_prog_put()
*/
@@ -592,7 +608,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
if (IS_ERR(prog))
return prog;
- atomic_inc(&prog->aux->refcnt);
+ prog = bpf_prog_inc(prog);
fdput(f);
return prog;
diff --git a/kernel/kernel/bpf/verifier.c b/kernel/kernel/bpf/verifier.c
index 2e7f7ab73..2cbfba78d 100644
--- a/kernel/kernel/bpf/verifier.c
+++ b/kernel/kernel/bpf/verifier.c
@@ -239,15 +239,6 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
-static const struct {
- int map_type;
- int func_id;
-} func_limit[] = {
- {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
-};
-
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -898,24 +889,44 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
{
- bool bool_map, bool_func;
- int i;
-
if (!map)
return 0;
- for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
- bool_map = (map->map_type == func_limit[i].map_type);
- bool_func = (func_id == func_limit[i].func_id);
- /* only when map & func pair match it can continue.
- * don't allow any other map type to be passed into
- * the special func;
- */
- if (bool_func && bool_map != bool_func)
- return -EINVAL;
+ /* We need a two way check, first is from map perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ default:
+ break;
}
return 0;
+error:
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
+ return -EINVAL;
}
static int check_call(struct verifier_env *env, int func_id)
@@ -1348,6 +1359,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
verbose("BPF_LD_ABS uses reserved fields\n");
return -EINVAL;
@@ -2003,7 +2015,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
- fdput(f);
return PTR_ERR(map);
}
@@ -2023,15 +2034,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return -E2BIG;
}
- /* remember this map */
- env->used_maps[env->used_map_cnt++] = map;
-
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_bpf_prog_info()
*/
- bpf_map_inc(map, false);
+ map = bpf_map_inc(map, false);
+ if (IS_ERR(map)) {
+ fdput(f);
+ return PTR_ERR(map);
+ }
+ env->used_maps[env->used_map_cnt++] = map;
+
fdput(f);
next_insn:
insn++;
diff --git a/kernel/kernel/capability.c b/kernel/kernel/capability.c
index 45432b54d..4984e1f55 100644
--- a/kernel/kernel/capability.c
+++ b/kernel/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+{
+ int capable;
+
+ if (unlikely(!cap_valid(cap))) {
+ pr_crit("capable() called with invalid cap=%u\n", cap);
+ BUG();
+ }
+
+ capable = audit ? security_capable(current_cred(), ns, cap) :
+ security_capable_noaudit(current_cred(), ns, cap);
+ if (capable == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return true;
+ }
+ return false;
+}
+
/**
* ns_capable - Determine if the current task has a superior capability in effect
* @ns: The usernamespace we want the capability in
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- if (unlikely(!cap_valid(cap))) {
- pr_crit("capable() called with invalid cap=%u\n", cap);
- BUG();
- }
-
- if (security_capable(current_cred(), ns, cap) == 0) {
- current->flags |= PF_SUPERPRIV;
- return true;
- }
- return false;
+ return ns_capable_common(ns, cap, true);
}
EXPORT_SYMBOL(ns_capable);
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, false);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
/**
* capable - Determine if the current task has a superior capability in effect
@@ -431,6 +457,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
+ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
+ * @ns: The user namespace in question
+ * @inode: The inode in question
+ *
+ * Return true if the inode uid and gid are within the namespace.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+{
+ return kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+
+/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
@@ -443,7 +482,26 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/kernel/cgroup.c b/kernel/kernel/cgroup.c
index a0087e536..af31510a2 100644
--- a/kernel/kernel/cgroup.c
+++ b/kernel/kernel/cgroup.c
@@ -236,6 +236,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
static bool cgroup_ssid_enabled(int ssid)
{
+ if (CGROUP_SUBSYS_COUNT == 0)
+ return false;
+
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
@@ -2498,6 +2501,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
+ /*
+ * If ->dead, @src_set is associated with one or more dead cgroups
+ * and doesn't contain any migratable tasks. Ignore it early so
+ * that the rest of migration path doesn't get confused by it.
+ */
+ if (src_cset->dead)
+ return;
+
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
@@ -2713,9 +2724,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
+ struct cgroup_subsys *ss;
struct cgroup *cgrp;
pid_t pid;
- int ret;
+ int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
@@ -2763,8 +2775,10 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ for_each_subsys(ss, ssid)
+ if (ss->post_attach)
+ ss->post_attach();
cgroup_kn_unlock(of->kn);
- cpuset_post_attach_flush();
return ret ?: nbytes;
}
@@ -4681,14 +4695,15 @@ static void css_free_work_fn(struct work_struct *work)
if (ss) {
/* css free path */
+ struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
- if (css->parent)
- css_put(css->parent);
-
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
+
+ if (parent)
+ css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
@@ -4781,6 +4796,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
css->ss = ss;
+ css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
@@ -5131,6 +5147,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup_subsys_state *css;
+ struct cgrp_cset_link *link;
int ssid;
lockdep_assert_held(&cgroup_mutex);
@@ -5151,11 +5168,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return -EBUSY;
/*
- * Mark @cgrp dead. This prevents further task migration and child
- * creation by disabling cgroup_lock_live_group().
+ * Mark @cgrp and the associated csets dead. The former prevents
+ * further task migration and child creation by disabling
+ * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
+ spin_lock_bh(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ link->cset->dead = true;
+ spin_unlock_bh(&css_set_lock);
+
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
kill_css(css);
diff --git a/kernel/kernel/configs/tiny.config b/kernel/kernel/configs/tiny.config
index c2de56ab0..7fa0c4ae6 100644
--- a/kernel/kernel/configs/tiny.config
+++ b/kernel/kernel/configs/tiny.config
@@ -1,4 +1,12 @@
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_KERNEL_GZIP is not set
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
+# CONFIG_KERNEL_LZO is not set
+# CONFIG_KERNEL_LZ4 is not set
CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_SLAB is not set
+# CONFIG_SLUB is not set
CONFIG_SLOB=y
diff --git a/kernel/kernel/cpu.c b/kernel/kernel/cpu.c
index 8edd3c716..0be18c168 100644
--- a/kernel/kernel/cpu.c
+++ b/kernel/kernel/cpu.c
@@ -506,12 +506,6 @@ static int cpu_notify(unsigned long val, void *v)
return __cpu_notify(val, v, -1, NULL);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void cpu_notify_nofail(unsigned long val, void *v)
-{
- BUG_ON(cpu_notify(val, v));
-}
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
@@ -529,6 +523,12 @@ void __unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(__unregister_cpu_notifier);
+#ifdef CONFIG_HOTPLUG_CPU
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
+
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
diff --git a/kernel/kernel/cpuset.c b/kernel/kernel/cpuset.c
index 2ade63219..b271353d5 100644
--- a/kernel/kernel/cpuset.c
+++ b/kernel/kernel/cpuset.c
@@ -57,7 +57,6 @@
#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
@@ -325,8 +324,7 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
@@ -335,8 +333,20 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
@@ -1015,7 +1025,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
{
flush_workqueue(cpuset_migrate_mm_wq);
}
@@ -2075,6 +2085,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task, void *priv)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
@@ -2083,7 +2107,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
+ .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .fork = cpuset_fork,
.legacy_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/kernel/cred.c b/kernel/kernel/cred.c
index 71179a09c..ff8606f77 100644
--- a/kernel/kernel/cred.c
+++ b/kernel/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
diff --git a/kernel/kernel/debug/debug_core.c b/kernel/kernel/debug/debug_core.c
index 0874e2edd..79517e554 100644
--- a/kernel/kernel/debug/debug_core.c
+++ b/kernel/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- time_left = loops_per_jiffy * HZ;
+ time_left = MSEC_PER_SEC;
while (kgdb_do_roundup && --time_left &&
(atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
online_cpus)
- cpu_relax();
+ udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
diff --git a/kernel/kernel/events/core.c b/kernel/kernel/events/core.c
index 760f41d98..02a21c9f8 100644
--- a/kernel/kernel/events/core.c
+++ b/kernel/kernel/events/core.c
@@ -947,6 +947,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * cred_guard_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event_context::lock
@@ -1539,12 +1540,33 @@ static int __init perf_workqueue_init(void)
core_initcall(perf_workqueue_init);
-static inline int pmu_filter_match(struct perf_event *event)
+static inline int __pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
return pmu->filter_match ? pmu->filter_match(event) : 1;
}
+/*
+ * Check whether we should attempt to schedule an event group based on
+ * PMU-specific filtering. An event group can consist of HW and SW events,
+ * potentially with a SW leader, so we must check all the filters, to
+ * determine whether a group is schedulable:
+ */
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct perf_event *child;
+
+ if (!__pmu_filter_match(event))
+ return 0;
+
+ list_for_each_entry(child, &event->sibling_list, group_entry) {
+ if (!__pmu_filter_match(child))
+ return 0;
+ }
+
+ return 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
@@ -1581,14 +1603,14 @@ event_sched_out(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -3419,7 +3441,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;
rcu_read_lock();
if (!vpid)
@@ -3433,16 +3454,7 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}
/*
@@ -6028,6 +6040,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
char *buf = NULL;
char *name;
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
if (file) {
struct inode *inode;
dev_t dev;
@@ -6054,27 +6087,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
@@ -7111,7 +7123,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
prog = event->tp_event->prog;
if (prog) {
event->tp_event->prog = NULL;
- bpf_prog_put(prog);
+ bpf_prog_put_rcu(prog);
}
}
@@ -7981,6 +7993,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ /* symmetric to unaccount_event() in _free_event() */
+ account_event(event);
+
return event;
err_per_task:
@@ -8327,6 +8342,24 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ if (err)
+ goto err_cpus;
+
+ /*
+ * Reuse ptrace permission checks for now.
+ *
+ * We must hold cred_guard_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -8334,7 +8367,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_cred;
}
if (is_sampling_event(event)) {
@@ -8344,8 +8377,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- account_event(event);
-
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -8395,11 +8426,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- if (task) {
- put_task_struct(task);
- task = NULL;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -8487,6 +8513,11 @@ SYSCALL_DEFINE5(perf_event_open,
WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * This is the point on no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
+
if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
@@ -8556,6 +8587,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&gctx->mutex);
mutex_unlock(&ctx->mutex);
+ if (task) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ put_task_struct(task);
+ }
+
put_online_cpus();
event->owner = current;
@@ -8584,7 +8620,15 @@ err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
put_online_cpus();
err_task:
@@ -8628,8 +8672,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = EVENT_OWNER_KERNEL;
- account_event(event);
-
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -8866,6 +8908,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
{
diff --git a/kernel/kernel/events/ring_buffer.c b/kernel/kernel/events/ring_buffer.c
index adfdc0536..014b69528 100644
--- a/kernel/kernel/events/ring_buffer.c
+++ b/kernel/kernel/events/ring_buffer.c
@@ -347,6 +347,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
bool truncated)
{
struct ring_buffer *rb = handle->rb;
+ bool wakeup = truncated;
unsigned long aux_head;
u64 flags = 0;
@@ -375,9 +376,16 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
- perf_output_wakeup(handle);
+ wakeup = true;
local_add(rb->aux_watermark, &rb->aux_wakeup);
}
+
+ if (wakeup) {
+ if (truncated)
+ handle->event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ }
+
handle->event = NULL;
local_set(&rb->aux_nest, 0);
diff --git a/kernel/kernel/events/uprobes.c b/kernel/kernel/events/uprobes.c
index 7dad84913..da0c09ff6 100644
--- a/kernel/kernel/events/uprobes.c
+++ b/kernel/kernel/events/uprobes.c
@@ -171,8 +171,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!ptep) {
+ mem_cgroup_cancel_charge(kpage, memcg);
goto unlock;
+ }
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
@@ -199,7 +201,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/kernel/exit.c b/kernel/kernel/exit.c
index 73497e458..e199407f8 100644
--- a/kernel/kernel/exit.c
+++ b/kernel/kernel/exit.c
@@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
- /* Wait for all children (clone and not) if __WALL is set;
- * otherwise, wait for clone children *only* if __WCLONE is
- * set; otherwise, wait for non-clone children *only*. (Note:
- * A "clone" child here is one that reports to its parent
- * using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
- && !(wo->wo_flags & __WALL))
+
+ /*
+ * Wait for all children (clone and not) if __WALL is set or
+ * if it is traced by us.
+ */
+ if (ptrace || (wo->wo_flags & __WALL))
+ return 1;
+
+ /*
+ * Otherwise, wait for clone children *only* if __WCLONE is set;
+ * otherwise, wait for non-clone children *only*.
+ *
+ * Note: a "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD, or a non-leader thread which
+ * we can only see if it is traced by us.
+ */
+ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
return 0;
return 1;
@@ -1301,7 +1312,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (unlikely(exit_state == EXIT_DEAD))
return 0;
- ret = eligible_child(wo, p);
+ ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
diff --git a/kernel/kernel/fork.c b/kernel/kernel/fork.c
index 46c1e8342..3929b800d 100644
--- a/kernel/kernel/fork.c
+++ b/kernel/kernel/fork.c
@@ -598,7 +598,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+ struct user_namespace *user_ns)
{
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
@@ -638,6 +639,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
if (init_new_context(p, mm))
goto fail_nocontext;
+ mm->user_ns = get_user_ns(user_ns);
return mm;
fail_nocontext:
@@ -683,7 +685,7 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current);
+ return mm_init(mm, current, current_user_ns());
}
/*
@@ -698,6 +700,7 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ put_user_ns(mm->user_ns);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -790,6 +793,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
EXPORT_SYMBOL(get_mm_exe_file);
/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+ struct file *exe_file = NULL;
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (!(task->flags & PF_KTHREAD))
+ exe_file = get_mm_exe_file(mm);
+ }
+ task_unlock(task);
+ return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
+/**
* get_task_mm - acquire a reference to the task's mm
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
@@ -904,14 +930,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
deactivate_mm(tsk, mm);
/*
- * If we're exiting normally, clear a user-space tid field if
- * requested. We leave this alone when dying by signal, to leave
- * the value intact in a core dump, and to save the unnecessary
- * trouble, say, a killed vfork parent shouldn't touch this mm.
- * Userland only wants this done for a sys_exit.
+ * Signal userspace if we're not exiting with a core dump
+ * because we want to leave the value intact for debugging
+ * purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->flags & PF_SIGNALED) &&
+ if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
@@ -947,7 +971,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk))
+ if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
err = dup_mmap(mm, oldmm);
@@ -1379,10 +1403,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- raw_spin_lock_init(&p->vtime_lock);
- seqcount_init(&p->vtime_seq);
+ seqcount_init(&p->vtime_seqcount);
p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_SLEEPING;
+ p->vtime_snap_whence = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
@@ -1400,7 +1423,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
- threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1552,6 +1574,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
@@ -1652,6 +1675,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
+ threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_io:
@@ -1682,7 +1706,6 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
- threadgroup_change_end(current);
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
diff --git a/kernel/kernel/futex.c b/kernel/kernel/futex.c
index 02bb86921..059623427 100644
--- a/kernel/kernel/futex.c
+++ b/kernel/kernel/futex.c
@@ -1247,10 +1247,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (unlikely(should_fail_futex(true)))
ret = -EFAULT;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
+ } else if (curval != uval) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
if (ret) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
@@ -1278,7 +1288,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
* deboost first (and lose our higher priority), then the task might get
* scheduled away before the wake up can take place.
*/
- spin_unlock(&hb->lock);
+ deboost |= spin_unlock_no_deboost(&hb->lock);
wake_up_q(&wake_q);
wake_up_q_sleeper(&wake_sleeper_q);
if (deboost)
@@ -1479,8 +1489,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
hb_waiters_dec(hb1);
- plist_add(&q->list, &hb2->chain);
hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -2553,6 +2563,15 @@ retry:
if (ret == -EFAULT)
goto pi_faulted;
/*
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN) {
+ spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ goto retry;
+ }
+ /*
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
diff --git a/kernel/kernel/irq/generic-chip.c b/kernel/kernel/irq/generic-chip.c
index abd286afb..a4775f345 100644
--- a/kernel/kernel/irq/generic-chip.c
+++ b/kernel/kernel/irq/generic-chip.c
@@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
}
EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int hw_irq = data->hwirq;
+ struct irq_chip_generic *gc;
+ int irq_idx;
+
+ gc = irq_get_domain_generic_chip(d, hw_irq);
+ if (!gc)
+ return;
+
+ irq_idx = hw_irq % dgc->irqs_per_chip;
+
+ clear_bit(irq_idx, &gc->installed);
+ irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+ NULL);
+
+}
+
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
+ .unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
diff --git a/kernel/kernel/irq/msi.c b/kernel/kernel/irq/msi.c
index 6b0c0b74a..cd6009006 100644
--- a/kernel/kernel/irq/msi.c
+++ b/kernel/kernel/irq/msi.c
@@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq = -1;
+ int i, ret, virq;
ret = ops->msi_check(domain, info, dev);
if (ret == 0)
@@ -278,12 +278,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false);
if (virq < 0) {
ret = -ENOSPC;
@@ -302,11 +298,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
ops->msi_finish(&arg, 0);
for_each_msi_entry(desc, dev) {
+ virq = desc->irq;
if (desc->nvec_used == 1)
dev_dbg(dev, "irq %d for MSI\n", virq);
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
diff --git a/kernel/kernel/jump_label.c b/kernel/kernel/jump_label.c
index 05254eeb4..453ec4232 100644
--- a/kernel/kernel/jump_label.c
+++ b/kernel/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
void static_key_slow_inc(struct static_key *key)
{
+ int v, v1;
+
STATIC_KEY_CHECK_USE();
- if (atomic_inc_not_zero(&key->enabled))
- return;
+
+ /*
+ * Careful if we get concurrent static_key_slow_inc() calls;
+ * later calls must wait for the first one to _finish_ the
+ * jump_label_update() process. At the same time, however,
+ * the jump_label_update() call below wants to see
+ * static_key_enabled(&key) for jumps to be updated properly.
+ *
+ * So give a special meaning to negative key->enabled: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ for (v = atomic_read(&key->enabled); v > 0; v = v1) {
+ v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
+ if (likely(v1 == v))
+ return;
+ }
jump_label_lock();
- if (atomic_inc_return(&key->enabled) == 1)
+ if (atomic_read(&key->enabled) == 0) {
+ atomic_set(&key->enabled, -1);
jump_label_update(key);
+ atomic_set(&key->enabled, 1);
+ } else {
+ atomic_inc(&key->enabled);
+ }
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
static void __static_key_slow_dec(struct static_key *key,
unsigned long rate_limit, struct delayed_work *work)
{
+ /*
+ * The negative count check is valid even when a negative
+ * key->enabled is in use by static_key_slow_inc(); a
+ * __static_key_slow_dec() before the first static_key_slow_inc()
+ * returns is unbalanced, because all other static_key_slow_inc()
+ * instances block while the update is in progress.
+ */
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
@@ -108,6 +138,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
diff --git a/kernel/kernel/kexec_file.c b/kernel/kernel/kexec_file.c
index b70ada002..6030efd4a 100644
--- a/kernel/kernel/kexec_file.c
+++ b/kernel/kernel/kexec_file.c
@@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
return 0;
out:
vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
return ret;
}
diff --git a/kernel/kernel/locking/mcs_spinlock.h b/kernel/kernel/locking/mcs_spinlock.h
index 5b9102a47..c835270f0 100644
--- a/kernel/kernel/locking/mcs_spinlock.h
+++ b/kernel/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
- prev = xchg_acquire(lock, node);
+ /*
+ * We rely on the full barrier with global transitivity implied by the
+ * below xchg() to order the initialization stores above against any
+ * observation of @node. And to provide the ACQUIRE ordering associated
+ * with a LOCK primitive.
+ */
+ prev = xchg(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads
diff --git a/kernel/kernel/locking/mutex.c b/kernel/kernel/locking/mutex.c
index 0551c219c..89350f924 100644
--- a/kernel/kernel/locking/mutex.c
+++ b/kernel/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
if (!hold_ctx)
return 0;
- if (unlikely(ctx == hold_ctx))
- return -EALREADY;
-
if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
(ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
#ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
unsigned long flags;
int ret;
+ if (use_ww_ctx) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+ }
+
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
diff --git a/kernel/kernel/locking/qspinlock.c b/kernel/kernel/locking/qspinlock.c
index 87e9ce6a6..8173bc7fe 100644
--- a/kernel/kernel/locking/qspinlock.c
+++ b/kernel/kernel/locking/qspinlock.c
@@ -255,6 +255,66 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
+/*
+ * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
+ * issuing an _unordered_ store to set _Q_LOCKED_VAL.
+ *
+ * This means that the store can be delayed, but no later than the
+ * store-release from the unlock. This means that simply observing
+ * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
+ *
+ * There are two paths that can issue the unordered store:
+ *
+ * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
+ *
+ * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
+ * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
+ *
+ * However, in both cases we have other !0 state we've set before to queue
+ * ourseves:
+ *
+ * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
+ * load is constrained by that ACQUIRE to not pass before that, and thus must
+ * observe the store.
+ *
+ * For (2) we have a more intersting scenario. We enqueue ourselves using
+ * xchg_tail(), which ends up being a RELEASE. This in itself is not
+ * sufficient, however that is followed by an smp_cond_acquire() on the same
+ * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
+ * guarantees we must observe that store.
+ *
+ * Therefore both cases have other !0 state that is observable before the
+ * unordered locked byte store comes through. This means we can use that to
+ * wait for the lock store, and then wait for an unlock.
+ */
+#ifndef queued_spin_unlock_wait
+void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+ u32 val;
+
+ for (;;) {
+ val = atomic_read(&lock->val);
+
+ if (!val) /* not locked, we're done */
+ goto done;
+
+ if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
+ break;
+
+ /* not locked, but pending, wait until we observe the lock */
+ cpu_relax();
+ }
+
+ /* any unlock is good */
+ while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+done:
+ smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+}
+EXPORT_SYMBOL(queued_spin_unlock_wait);
+#endif
+
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/kernel/locking/rtmutex.c b/kernel/kernel/locking/rtmutex.c
index 30777e813..86a78c068 100644
--- a/kernel/kernel/locking/rtmutex.c
+++ b/kernel/kernel/locking/rtmutex.c
@@ -71,8 +71,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
{
- if (!rt_mutex_has_waiters(lock))
- clear_rt_mutex_waiters(lock);
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ if (rt_mutex_has_waiters(lock))
+ return;
+
+ /*
+ * The rbtree has no waiters enqueued, now make sure that the
+ * lock->owner still has the waiters bit set, otherwise the
+ * following can happen:
+ *
+ * CPU 0 CPU 1 CPU2
+ * l->owner=T1
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T2)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ *
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T3)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ * signal(->T2) signal(->T3)
+ * lock(l->lock)
+ * dequeue(T2)
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * dequeue(T3)
+ * ==> wait list is empty
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l) {
+ * l->owner = owner
+ * owner = l->owner & ~HAS_WAITERS;
+ * ==> l->owner = T1
+ * }
+ * lock(l->lock)
+ * rt_mutex_unlock(l) fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l) {
+ * owner = l->owner & ~HAS_WAITERS;
+ * cmpxchg(l->owner, T1, NULL)
+ * ===> Success (l->owner = NULL)
+ *
+ * l->owner = owner
+ * ==> l->owner = T1
+ * }
+ *
+ * With the check for the waiter bit in place T3 on CPU2 will not
+ * overwrite. All tasks fiddling with the waiters bit are
+ * serialized by l->lock, so nothing else can modify the waiters
+ * bit. If the bit is set then nothing can change l->owner either
+ * so the simple RMW is safe. The cmpxchg() will simply fail if it
+ * happens in the middle of the RMW because the waiters bit is
+ * still set.
+ */
+ owner = READ_ONCE(*p);
+ if (owner & RT_MUTEX_HAS_WAITERS)
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
}
static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
@@ -939,13 +1003,14 @@ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
slowfn(lock, do_mig_dis);
}
-static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
- void (*slowfn)(struct rt_mutex *lock))
+static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
- else
- slowfn(lock);
+ return 0;
+ }
+ return slowfn(lock);
}
#ifdef CONFIG_SMP
/*
@@ -1086,7 +1151,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Slow path to release a rt_mutex spin_lock style
*/
-static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
+static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
{
unsigned long flags;
WAKE_Q(wake_q);
@@ -1101,7 +1166,7 @@ static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return;
+ return 0;
}
mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
@@ -1112,6 +1177,33 @@ static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
/* Undo pi boosting.when necessary */
rt_mutex_adjust_prio(current);
+ return 0;
+}
+
+static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
+{
+ unsigned long flags;
+ WAKE_Q(wake_q);
+ WAKE_Q(wake_sleeper_q);
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ return 0;
+ }
+
+ mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ wake_up_q(&wake_q);
+ wake_up_q_sleeper(&wake_sleeper_q);
+ return 1;
}
void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
@@ -1166,6 +1258,17 @@ void __lockfunc rt_spin_unlock(spinlock_t *lock)
}
EXPORT_SYMBOL(rt_spin_unlock);
+int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
+{
+ int ret;
+
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
+ migrate_enable();
+ return ret;
+}
+
void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
{
rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
diff --git a/kernel/kernel/locking/rtmutex_common.h b/kernel/kernel/locking/rtmutex_common.h
index 289f062f2..f457c7574 100644
--- a/kernel/kernel/locking/rtmutex_common.h
+++ b/kernel/kernel/locking/rtmutex_common.h
@@ -76,8 +76,9 @@ task_top_pi_waiter(struct task_struct *p)
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
{
- return (struct task_struct *)
- ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+ unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+ return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);
}
/*
diff --git a/kernel/kernel/memremap.c b/kernel/kernel/memremap.c
index 25ced161e..f719c925c 100644
--- a/kernel/kernel/memremap.c
+++ b/kernel/kernel/memremap.c
@@ -159,7 +159,9 @@ static void devm_memremap_pages_release(struct device *dev, void *res)
struct page_map *page_map = res;
/* pages are dead and unused, undo the arch mapping */
+ mem_hotplug_begin();
arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+ mem_hotplug_done();
}
void *devm_memremap_pages(struct device *dev, struct resource *res)
@@ -189,7 +191,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
if (nid < 0)
nid = numa_mem_id();
+ mem_hotplug_begin();
error = arch_add_memory(nid, res->start, resource_size(res), true);
+ mem_hotplug_done();
if (error) {
devres_free(page_map);
return ERR_PTR(error);
diff --git a/kernel/kernel/module.c b/kernel/kernel/module.c
index 0e5c71195..b14a4f312 100644
--- a/kernel/kernel/module.c
+++ b/kernel/kernel/module.c
@@ -2606,13 +2606,18 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
int err = -ENOKEY;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
const void *mod = info->hdr;
- if (info->len > markerlen &&
+ /*
+ * Require flags == 0, as a module with version information
+ * removed is no longer the module that was signed
+ */
+ if (flags == 0 &&
+ info->len > markerlen &&
memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
@@ -2631,7 +2636,7 @@ static int module_sig_check(struct load_info *info)
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
return 0;
}
@@ -3444,7 +3449,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err;
char *after_dashes;
- err = module_sig_check(info);
+ err = module_sig_check(info, flags);
if (err)
goto free_copy;
diff --git a/kernel/kernel/panic.c b/kernel/kernel/panic.c
index 50d4ae2e7..3535f8029 100644
--- a/kernel/kernel/panic.c
+++ b/kernel/kernel/panic.c
@@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
cpu_relax();
}
+/*
+ * Stop ourselves in NMI context if another CPU has already panicked. Arch code
+ * may override this to prepare for crash dumping, e.g. save regs info.
+ */
+void __weak nmi_panic_self_stop(struct pt_regs *regs)
+{
+ panic_smp_self_stop();
+}
+
+atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
+/*
+ * A variant of panic() called from NMI context. We return if we've already
+ * panicked on this CPU. If another CPU already panicked, loop in
+ * nmi_panic_self_stop() which can provide architecture dependent code such
+ * as saving register state for crash dump.
+ */
+void nmi_panic(struct pt_regs *regs, const char *msg)
+{
+ int old_cpu, cpu;
+
+ cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+
+ if (old_cpu == PANIC_CPU_INVALID)
+ panic("%s", msg);
+ else if (old_cpu != cpu)
+ nmi_panic_self_stop(regs);
+}
+EXPORT_SYMBOL(nmi_panic);
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
*/
void panic(const char *fmt, ...)
{
- static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
va_list args;
long i, i_next = 0;
int state = 0;
+ int old_cpu, this_cpu;
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
- * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
@@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
* multiple parallel invocations of panic, all other CPUs either
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
+ *
+ * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
+ * comes here, so go ahead.
+ * `old_cpu == this_cpu' means we came from nmi_panic() which sets
+ * panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- if (!spin_trylock(&panic_lock))
+ this_cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+
+ if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
diff --git a/kernel/kernel/power/hibernate.c b/kernel/kernel/power/hibernate.c
index fbb23f93e..c1b981521 100644
--- a/kernel/kernel/power/hibernate.c
+++ b/kernel/kernel/power/hibernate.c
@@ -301,12 +301,12 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
+ restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- /* Restore control flow magically appears here */
- restore_processor_state();
if (!in_suspend)
events_check_enabled = false;
@@ -342,6 +342,7 @@ int hibernation_snapshot(int platform_mode)
pm_message_t msg;
int error;
+ pm_suspend_clear_flags();
error = platform_begin(platform_mode);
if (error)
goto Close;
diff --git a/kernel/kernel/power/snapshot.c b/kernel/kernel/power/snapshot.c
index 3a9706043..f155c62f1 100644
--- a/kernel/kernel/power/snapshot.c
+++ b/kernel/kernel/power/snapshot.c
@@ -765,9 +765,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -775,9 +775,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
diff --git a/kernel/kernel/power/suspend_test.c b/kernel/kernel/power/suspend_test.c
index 084452e34..bdff5ed57 100644
--- a/kernel/kernel/power/suspend_test.c
+++ b/kernel/kernel/power/suspend_test.c
@@ -203,8 +203,10 @@ static int __init test_suspend(void)
/* RTCs have initialized by now too ... can we use one? */
dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
- if (dev)
+ if (dev) {
rtc = rtc_class_open(dev_name(dev));
+ put_device(dev);
+ }
if (!rtc) {
printk(warn_no_rtc);
return 0;
diff --git a/kernel/kernel/printk/braille.c b/kernel/kernel/printk/braille.c
index 276762f3a..d5760c42f 100644
--- a/kernel/kernel/printk/braille.c
+++ b/kernel/kernel/printk/braille.c
@@ -9,10 +9,10 @@
char *_braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
*str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
+ } else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
if (!*str)
diff --git a/kernel/kernel/printk/printk.c b/kernel/kernel/printk/printk.c
index f75e4b0c6..c747bdfa1 100644
--- a/kernel/kernel/printk/printk.c
+++ b/kernel/kernel/printk/printk.c
@@ -1527,6 +1527,11 @@ static void call_console_drivers(int level,
if (!console_drivers)
return;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
+ if (in_irq() || in_nmi())
+ return;
+ }
+
migrate_disable();
for_each_console(con) {
if (exclusive_console && con != exclusive_console)
@@ -2464,6 +2469,11 @@ void console_unblank(void)
{
struct console *c;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
+ if (in_irq() || in_nmi())
+ return;
+ }
+
/*
* console_unblank can no longer be called in interrupt context unless
* oops_in_progress is set to 1..
diff --git a/kernel/kernel/ptrace.c b/kernel/kernel/ptrace.c
index 1004af706..6947f7691 100644
--- a/kernel/kernel/ptrace.c
+++ b/kernel/kernel/ptrace.c
@@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
BUG_ON(!list_empty(&child->ptrace_entry));
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
+ rcu_read_lock();
+ child->ptracer_cred = get_cred(__task_cred(new_parent));
+ rcu_read_unlock();
}
/**
@@ -71,11 +74,15 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
child->ptrace = 0;
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
@@ -226,7 +233,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
- int dumpable = 0;
+ struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -277,16 +284,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
- smp_rmb();
- if (task->mm)
- dumpable = get_dumpable(task->mm);
- rcu_read_lock();
- if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
+ mm = task->mm;
+ if (mm &&
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptrace_has_cap(mm->user_ns, mode)))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
@@ -350,10 +352,6 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize)
flags |= PT_SEIZED;
- rcu_read_lock();
- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
diff --git a/kernel/kernel/rcu/tree_plugin.h b/kernel/kernel/rcu/tree_plugin.h
index 8e119cf64..45e3e3e02 100644
--- a/kernel/kernel/rcu/tree_plugin.h
+++ b/kernel/kernel/rcu/tree_plugin.h
@@ -2175,6 +2175,7 @@ static int rcu_nocb_kthread(void *arg)
cl++;
c++;
local_bh_enable();
+ cond_resched_rcu_qs();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
diff --git a/kernel/kernel/sched/Makefile b/kernel/kernel/sched/Makefile
index debedbee5..01b9994b3 100644
--- a/kernel/kernel/sched/Makefile
+++ b/kernel/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o swait.o work-simple.o completion.o idle.o
+obj-y += wait.o swait.o swork.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c
index 94827a593..e9b8d5182 100644
--- a/kernel/kernel/sched/core.c
+++ b/kernel/kernel/sched/core.c
@@ -670,7 +670,10 @@ int get_nohz_timer_target(void)
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
cpu = i;
goto unlock;
}
@@ -2098,6 +2101,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
@@ -3127,7 +3152,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(task_stack_end_corrupted(prev));
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
#endif
if (unlikely(in_atomic_preempt_off())) {
@@ -3470,7 +3496,7 @@ static __always_inline int preemptible_lazy(void)
#else
-static int preemptible_lazy(void)
+static inline int preemptible_lazy(void)
{
return 1;
}
@@ -5185,14 +5211,16 @@ void show_state_filter(unsigned long state_filter)
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
- touch_all_softlockup_watchdogs();
-
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
@@ -5768,6 +5796,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE:
rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
break;
case CPU_ONLINE:
@@ -7939,7 +7968,7 @@ void set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
@@ -7965,7 +7994,7 @@ struct task_group *sched_create_group(struct task_group *parent)
return tg;
err:
- free_sched_group(tg);
+ sched_free_group(tg);
return ERR_PTR(-ENOMEM);
}
@@ -7985,17 +8014,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
{
/* now it should be safe to free those cfs_rqs */
- free_sched_group(container_of(rhp, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
}
-/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
/* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
+ call_rcu(&tg->rcu, sched_free_group_rcu);
}
void sched_offline_group(struct task_group *tg)
@@ -8456,31 +8484,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
+ sched_online_group(tg, parent);
+
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);
- if (parent)
- sched_online_group(tg, parent);
- return 0;
+ sched_offline_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
-
- sched_offline_group(tg);
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
}
static void cpu_cgroup_fork(struct task_struct *task, void *private)
@@ -8840,9 +8863,8 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
- .css_online = cpu_cgroup_css_online,
- .css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
diff --git a/kernel/kernel/sched/cputime.c b/kernel/kernel/sched/cputime.c
index 0f75a38cf..558b98af2 100644
--- a/kernel/kernel/sched/cputime.c
+++ b/kernel/kernel/sched/cputime.c
@@ -259,21 +259,21 @@ static __always_inline bool steal_account_process_tick(void)
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
- cputime_t steal_ct;
+ unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
/*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
+ * steal is in nsecs but our caller is expecting steal
+ * time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds.
*/
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+ steal_jiffies = nsecs_to_jiffies(steal);
+ this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
- account_steal_time(steal_ct);
- return steal_ct;
+ account_steal_time(jiffies_to_cputime(steal_jiffies));
+ return steal_jiffies;
}
#endif
return false;
@@ -600,19 +600,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -635,7 +641,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
@@ -680,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long long delta = vtime_delta(tsk);
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap += delta;
/* CHECKME: always safe to convert nsecs to cputime? */
@@ -696,45 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_user_enter(struct task_struct *tsk)
{
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
@@ -746,23 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags &= ~PF_VCPU;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -775,30 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
void arch_vtime_task_switch(struct task_struct *prev)
{
- raw_spin_lock(&prev->vtime_lock);
- write_seqcount_begin(&prev->vtime_seq);
- prev->vtime_snap_whence = VTIME_SLEEPING;
- write_seqcount_end(&prev->vtime_seq);
- raw_spin_unlock(&prev->vtime_lock);
+ write_seqcount_begin(&prev->vtime_seqcount);
+ prev->vtime_snap_whence = VTIME_INACTIVE;
+ write_seqcount_end(&prev->vtime_seqcount);
- raw_spin_lock(&current->vtime_lock);
- write_seqcount_begin(&current->vtime_seq);
+ write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_seqcount_end(&current->vtime_seq);
- raw_spin_unlock(&current->vtime_lock);
+ write_seqcount_end(&current->vtime_seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
- raw_spin_lock_irqsave(&t->vtime_lock, flags);
- write_seqcount_begin(&t->vtime_seq);
+ local_irq_save(flags);
+ write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
t->vtime_snap = sched_clock_cpu(cpu);
- write_seqcount_end(&t->vtime_seq);
- raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
+ write_seqcount_end(&t->vtime_seqcount);
+ local_irq_restore(flags);
}
cputime_t task_gtime(struct task_struct *t)
@@ -810,13 +799,13 @@ cputime_t task_gtime(struct task_struct *t)
return t->gtime;
do {
- seq = read_seqcount_begin(&t->vtime_seq);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
gtime = t->gtime;
if (t->flags & PF_VCPU)
gtime += vtime_delta(t);
- } while (read_seqcount_retry(&t->vtime_seq, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
return gtime;
}
@@ -839,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
*udelta = 0;
*sdelta = 0;
- seq = read_seqcount_begin(&t->vtime_seq);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
if (u_dst)
*u_dst = *u_src;
@@ -847,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
*s_dst = *s_src;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ if (t->vtime_snap_whence == VTIME_INACTIVE ||
is_idle_task(t))
continue;
@@ -863,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
- } while (read_seqcount_retry(&t->vtime_seq, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
diff --git a/kernel/kernel/sched/fair.c b/kernel/kernel/sched/fair.c
index 8c71e627b..cf0a1adba 100644
--- a/kernel/kernel/sched/fair.c
+++ b/kernel/kernel/sched/fair.c
@@ -687,8 +687,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -1193,8 +1191,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1262,20 +1258,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1359,6 +1365,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1384,9 +1391,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2682,6 +2696,23 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
@@ -2690,15 +2721,15 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- sa->load_avg = max_t(long, sa->load_avg - r, 0);
- sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->load_avg, r);
+ sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sa->util_avg = max_t(long, sa->util_avg - r, 0);
- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ sub_positive(&sa->util_avg, r);
+ sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2764,10 +2795,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
&se->avg, se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -4577,19 +4608,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
diff --git a/kernel/kernel/sched/loadavg.c b/kernel/kernel/sched/loadavg.c
index ef7159012..b0b93fd33 100644
--- a/kernel/kernel/sched/loadavg.c
+++ b/kernel/kernel/sched/loadavg.c
@@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq)
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
}
#ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/kernel/sched/sched.h b/kernel/kernel/sched/sched.h
index 7dd5f53b6..a8a9b156e 100644
--- a/kernel/kernel/sched/sched.h
+++ b/kernel/kernel/sched/sched.h
@@ -1780,3 +1780,16 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+ rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ rq->prev_steal_time_rq = 0;
+#endif
+}
diff --git a/kernel/kernel/sched/work-simple.c b/kernel/kernel/sched/swork.c
index 9ffe40543..1950f40ca 100644
--- a/kernel/kernel/sched/work-simple.c
+++ b/kernel/kernel/sched/swork.c
@@ -6,7 +6,7 @@
*/
#include <linux/swait.h>
-#include <linux/work-simple.h>
+#include <linux/swork.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
diff --git a/kernel/kernel/sysctl.c b/kernel/kernel/sysctl.c
index dc6858d66..2f0d15725 100644
--- a/kernel/kernel/sysctl.c
+++ b/kernel/kernel/sysctl.c
@@ -1735,6 +1735,20 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "pipe-user-pages-hard",
+ .data = &pipe_user_pages_hard,
+ .maxlen = sizeof(pipe_user_pages_hard),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "pipe-user-pages-soft",
+ .data = &pipe_user_pages_soft,
+ .maxlen = sizeof(pipe_user_pages_soft),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
@@ -2037,6 +2051,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2164,8 +2193,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- NULL,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2366,6 +2414,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
break;
if (neg)
continue;
+ val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
@@ -2778,6 +2827,12 @@ int proc_dointvec(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2823,6 +2878,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
* exception granted :-)
*/
EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/kernel/sysctl_binary.c b/kernel/kernel/sysctl_binary.c
index 7e7746a42..10a1d7dc9 100644
--- a/kernel/kernel/sysctl_binary.c
+++ b/kernel/kernel/sysctl_binary.c
@@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
}
mnt = task_active_pid_ns(current)->proc_mnt;
- file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
+ file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
result = PTR_ERR(file);
if (IS_ERR(file))
goto out_putname;
diff --git a/kernel/kernel/time/clocksource.c b/kernel/kernel/time/clocksource.c
index 1347882d1..b98810d2f 100644
--- a/kernel/kernel/time/clocksource.c
+++ b/kernel/kernel/time/clocksource.c
@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
/* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_select_watchdog(bool fallback)
+{
+ struct clocksource *cs, *old_wd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ /* save current watchdog */
+ old_wd = watchdog;
+ if (fallback)
+ watchdog = NULL;
+
+ list_for_each_entry(cs, &clocksource_list, list) {
+ /* cs is a clocksource to be watched. */
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
+ continue;
+
+ /* Skip current if we were requested for a fallback. */
+ if (fallback && cs == old_wd)
+ continue;
+
/* Pick the best watchdog. */
- if (!watchdog || cs->rating > watchdog->rating) {
+ if (!watchdog || cs->rating > watchdog->rating)
watchdog = cs;
- /* Reset watchdog cycles */
- clocksource_reset_watchdog();
- }
}
+ /* If we failed to find a fallback restore the old one. */
+ if (!watchdog)
+ watchdog = old_wd;
+
+ /* If we changed the watchdog we need to reset cycles. */
+ if (watchdog != old_wd)
+ clocksource_reset_watchdog();
+
/* Check if the watchdog timer needs to be started. */
clocksource_start_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
- /*
- * I really can't convince myself to support this on hardware
- * designed by lobotomized monkeys.
- */
- if (clocksource_is_watchdog(cs))
- return -EBUSY;
+ if (clocksource_is_watchdog(cs)) {
+ /* Select and try to install a replacement watchdog. */
+ clocksource_select_watchdog(true);
+ if (clocksource_is_watchdog(cs))
+ return -EBUSY;
+ }
if (cs == curr_clocksource) {
/* Select and try to install a replacement clock source */
diff --git a/kernel/kernel/time/hrtimer.c b/kernel/kernel/time/hrtimer.c
index 2659c632a..549526e9f 100644
--- a/kernel/kernel/time/hrtimer.c
+++ b/kernel/kernel/time/hrtimer.c
@@ -96,6 +96,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ /* Make sure we catch unsupported clockids */
+ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
+
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
@@ -104,7 +107,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- return hrtimer_clock_to_base_table[clock_id];
+ int base = hrtimer_clock_to_base_table[clock_id];
+ BUG_ON(base == HRTIMER_MAX_CLOCK_BASES);
+ return base;
}
/*
diff --git a/kernel/kernel/time/ntp.c b/kernel/kernel/time/ntp.c
index cd925efb2..0f6868fd2 100644
--- a/kernel/kernel/time/ntp.c
+++ b/kernel/kernel/time/ntp.c
@@ -717,8 +717,24 @@ int ntp_validate_timex(struct timex *txc)
return -EINVAL;
}
- if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
- return -EPERM;
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (txc->modes & ADJ_NANO) {
+ struct timespec ts;
+
+ ts.tv_sec = txc->time.tv_sec;
+ ts.tv_nsec = txc->time.tv_usec;
+ if (!timespec_inject_offset_valid(&ts))
+ return -EINVAL;
+
+ } else {
+ if (!timeval_inject_offset_valid(&txc->time))
+ return -EINVAL;
+ }
+ }
/*
* Check for potential multiplication overflows that can
diff --git a/kernel/kernel/time/posix-cpu-timers.c b/kernel/kernel/time/posix-cpu-timers.c
index bf28ef601..b7342b6e6 100644
--- a/kernel/kernel/time/posix-cpu-timers.c
+++ b/kernel/kernel/time/posix-cpu-timers.c
@@ -809,6 +809,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
timer->it.cpu.expires = 0;
sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
&itp->it_value);
+ return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
unlock_task_sighand(p, &flags);
diff --git a/kernel/kernel/time/tick-broadcast.c b/kernel/kernel/time/tick-broadcast.c
index f6aae7977..d2a20e83e 100644
--- a/kernel/kernel/time/tick-broadcast.c
+++ b/kernel/kernel/time/tick-broadcast.c
@@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
int cpu = smp_processor_id();
+ if (!bc)
+ return;
+
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
int was_periodic = clockevent_state_periodic(bc);
diff --git a/kernel/kernel/time/timekeeping.c b/kernel/kernel/time/timekeeping.c
index a1c5c6fc2..e060b34d5 100644
--- a/kernel/kernel/time/timekeeping.c
+++ b/kernel/kernel/time/timekeeping.c
@@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
+static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+ cycle_t delta)
+{
+ u64 nsec;
+
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + arch_gettimeoffset();
+}
+
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
cycle_t delta;
- s64 nsec;
delta = timekeeping_get_delta(tkr);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
- nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+ cycle_t cycles)
+{
+ cycle_t delta;
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
}
/**
@@ -383,7 +400,13 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ now = ktime_to_ns(tkr->base);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tkr->read(tkr->clock),
+ tkr->cycle_last,
+ tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
@@ -958,7 +981,7 @@ int timekeeping_inject_offset(struct timespec *ts)
struct timespec64 ts64, tmp;
int ret = 0;
- if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_inject_offset_valid(ts))
return -EINVAL;
ts64 = timespec_to_timespec64(*ts);
diff --git a/kernel/kernel/time/timekeeping_debug.c b/kernel/kernel/time/timekeeping_debug.c
index f6bd65236..107310a6f 100644
--- a/kernel/kernel/time/timekeeping_debug.c
+++ b/kernel/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
#include "timekeeping_internal.h"
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
{
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
void tk_debug_account_sleep_time(struct timespec64 *t)
{
- sleep_time_bin[fls(t->tv_sec)]++;
+ /* Cap bin index so we don't overflow the array */
+ int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+ sleep_time_bin[bin]++;
}
diff --git a/kernel/kernel/time/timer.c b/kernel/kernel/time/timer.c
index 016a0bf52..8b757541a 100644
--- a/kernel/kernel/time/timer.c
+++ b/kernel/kernel/time/timer.c
@@ -1051,7 +1051,7 @@ static void wait_for_running_timer(struct timer_list *timer)
base->running_timer != timer);
}
-# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer)
+# define wakeup_timer_waiters(b) wake_up_all(&(b)->wait_for_running_timer)
#else
static inline void wait_for_running_timer(struct timer_list *timer)
{
@@ -1313,8 +1313,8 @@ static inline void __run_timers(struct tvec_base *base)
}
}
}
- wakeup_timer_waiters(base);
spin_unlock_irq(&base->lock);
+ wakeup_timer_waiters(base);
}
#ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/kernel/trace/Makefile b/kernel/kernel/trace/Makefile
index 3bbaea068..bc08c6730 100644
--- a/kernel/kernel/trace/Makefile
+++ b/kernel/kernel/trace/Makefile
@@ -1,4 +1,8 @@
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
# Do not instrument the tracer itself:
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/kernel/trace/ring_buffer.c b/kernel/kernel/trace/ring_buffer.c
index 9c6045a27..acbb0e73d 100644
--- a/kernel/kernel/trace/ring_buffer.c
+++ b/kernel/kernel/trace/ring_buffer.c
@@ -437,7 +437,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
- unsigned int nr_pages;
+ unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
@@ -458,7 +458,7 @@ struct ring_buffer_per_cpu {
u64 write_stamp;
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
- int nr_pages_to_update;
+ long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
struct work_struct update_pages_work;
struct completion update_done;
@@ -1137,10 +1137,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
{
- int i;
struct buffer_page *bpage, *tmp;
+ long i;
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1177,7 +1177,7 @@ free_pages:
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+ unsigned long nr_pages)
{
LIST_HEAD(pages);
@@ -1202,7 +1202,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1302,8 +1302,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
struct ring_buffer *buffer;
+ long nr_pages;
int bsize;
- int cpu, nr_pages;
+ int cpu;
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1429,12 +1430,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
}
static int
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
struct buffer_page *to_remove_page, *tmp_iter_page;
struct buffer_page *last_page, *first_page;
- unsigned int nr_removed;
+ unsigned long nr_removed;
unsigned long head_bit;
int page_entries;
@@ -1651,7 +1652,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages;
+ unsigned long nr_pages;
int cpu, err = 0;
/*
@@ -1665,14 +1666,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return size;
- size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
- size *= BUF_PAGE_SIZE;
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
/* we need a minimum of two pages */
- if (size < BUF_PAGE_SIZE * 2)
- size = BUF_PAGE_SIZE * 2;
+ if (nr_pages < 2)
+ nr_pages = 2;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size = nr_pages * BUF_PAGE_SIZE;
/*
* Don't succeed if resizing is disabled, as a reader might be
@@ -4645,8 +4645,9 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
- int cpu_i, nr_pages_same;
- unsigned int nr_pages;
+ long nr_pages_same;
+ int cpu_i;
+ unsigned long nr_pages;
switch (action) {
case CPU_UP_PREPARE:
diff --git a/kernel/kernel/trace/trace.c b/kernel/kernel/trace/trace.c
index f1ee84b00..cad1a28bf 100644
--- a/kernel/kernel/trace/trace.c
+++ b/kernel/kernel/trace/trace.c
@@ -2559,17 +2559,17 @@ get_total_entries(struct trace_buffer *buf,
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _--------=> CPU# \n"
- "# / _-------=> irqs-off \n"
- "# | / _------=> need-resched \n"
- "# || / _-----=> need-resched_lazy \n"
- "# ||| / _----=> hardirq/softirq \n"
- "# |||| / _---=> preempt-depth \n"
- "# ||||| / _--=> preempt-lazy-depth\n"
- "# |||||| / _-=> migrate-disable \n"
- "# ||||||| / delay \n"
- "# cmd pid |||||||| time | caller \n"
- "# \\ / |||||||| \\ | / \n");
+ seq_puts(m, "# _--------=> CPU# \n"
+ "# / _-------=> irqs-off \n"
+ "# | / _------=> need-resched \n"
+ "# || / _-----=> need-resched_lazy \n"
+ "# ||| / _----=> hardirq/softirq \n"
+ "# |||| / _---=> preempt-depth \n"
+ "# ||||| / _--=> preempt-lazy-depth\n"
+ "# |||||| / _-=> migrate-disable \n"
+ "# ||||||| / delay \n"
+ "# cmd pid |||||||| time | caller \n"
+ "# \\ / |||||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2598,11 +2598,11 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
"# |/ _-----=> need-resched_lazy\n"
"# || / _---=> hardirq/softirq\n"
"# ||| / _--=> preempt-depth\n"
- "# |||| /_--=> preempt-lazy-depth\n"
- "# ||||| _-=> migrate-disable \n"
- "# ||||| / delay\n"
- "# TASK-PID CPU# |||||| TIMESTAMP FUNCTION\n"
- "# | | | |||||| | |\n");
+ "# |||| / _-=> preempt-lazy-depth\n"
+ "# ||||| / _-=> migrate-disable \n"
+ "# |||||| / delay\n"
+ "# TASK-PID CPU# ||||||| TIMESTAMP FUNCTION\n"
+ "# | | | ||||||| | |\n");
}
void
@@ -4737,19 +4737,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
struct trace_iterator *iter = filp->private_data;
ssize_t sret;
- /* return any leftover data */
- sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (sret != -EBUSY)
- return sret;
-
- trace_seq_init(&iter->seq);
-
/*
* Avoid more than one consumer on a single file descriptor
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
mutex_lock(&iter->mutex);
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ goto out;
+
+ trace_seq_init(&iter->seq);
+
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
@@ -4959,7 +4960,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
spd.nr_pages = i;
- ret = splice_to_pipe(pipe, &spd);
+ if (i)
+ ret = splice_to_pipe(pipe, &spd);
+ else
+ ret = 0;
out:
splice_shrink_spd(&spd);
return ret;
@@ -5773,9 +5777,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
if (*ppos & (PAGE_SIZE - 1))
return -EINVAL;
@@ -5785,6 +5786,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
again:
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5842,19 +5846,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
if (ret)
- return ret;
+ goto out;
+ ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
- return -EAGAIN;
+ goto out;
ret = wait_on_pipe(iter, true);
if (ret)
- return ret;
+ goto out;
goto again;
}
ret = splice_to_pipe(pipe, &spd);
+out:
splice_shrink_spd(&spd);
return ret;
diff --git a/kernel/kernel/trace/trace_events.c b/kernel/kernel/trace/trace_events.c
index 0e508e99b..5bd79b347 100644
--- a/kernel/kernel/trace/trace_events.c
+++ b/kernel/kernel/trace/trace_events.c
@@ -246,6 +246,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
+ /*
+ * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, we need to subtract one to offset the increment.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ fbuffer->pc--;
fbuffer->trace_file = trace_file;
fbuffer->event =
@@ -2109,8 +2117,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("filter", 0644, file->dir, file,
&ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ /*
+ * Only event directories that can be enabled should have
+ * triggers.
+ */
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
diff --git a/kernel/kernel/trace/trace_functions_graph.c b/kernel/kernel/trace/trace_functions_graph.c
index a663cbb84..7fd6f5a26 100644
--- a/kernel/kernel/trace/trace_functions_graph.c
+++ b/kernel/kernel/trace/trace_functions_graph.c
@@ -780,6 +780,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
@@ -788,7 +792,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data->depth = call->depth - 1;
/* No need to keep this function around for this depth */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = 0;
}
@@ -818,11 +823,16 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
cpu_data->depth = call->depth;
/* Save this function pointer to see if the exit matches */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = call->func;
}
@@ -1052,7 +1062,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
*/
cpu_data->depth = trace->depth - 1;
- if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ if (trace->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(trace->depth < 0)) {
if (cpu_data->enter_funcs[trace->depth] != trace->func)
func_match = 0;
cpu_data->enter_funcs[trace->depth] = 0;
diff --git a/kernel/kernel/trace/trace_irqsoff.c b/kernel/kernel/trace/trace_irqsoff.c
index 069942c22..553e71254 100644
--- a/kernel/kernel/trace/trace_irqsoff.c
+++ b/kernel/kernel/trace/trace_irqsoff.c
@@ -110,8 +110,12 @@ static int func_prolog_dec(struct trace_array *tr,
return 0;
local_save_flags(*flags);
- /* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(*flags))
+ /*
+ * Slight chance to get a false positive on tracing_cpu,
+ * although I'm starting to think there isn't a chance.
+ * Leave this for now just to be paranoid.
+ */
+ if (!irqs_disabled_flags(*flags) && !preempt_count())
return 0;
*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
diff --git a/kernel/kernel/trace/trace_printk.c b/kernel/kernel/trace/trace_printk.c
index 060df67db..ad1d6164e 100644
--- a/kernel/kernel/trace/trace_printk.c
+++ b/kernel/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
{
struct trace_bprintk_fmt *pos;
+
+ if (!fmt)
+ return ERR_PTR(-EINVAL);
+
list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
if (!strcmp(pos->fmt, fmt))
return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
for (iter = start; iter < end; iter++) {
struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
if (tb_fmt) {
- *iter = tb_fmt->fmt;
+ if (!IS_ERR(tb_fmt))
+ *iter = tb_fmt->fmt;
continue;
}
@@ -296,6 +301,9 @@ static int t_show(struct seq_file *m, void *v)
const char *str = *fmt;
int i;
+ if (!*fmt)
+ return 0;
+
seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
/*
diff --git a/kernel/kernel/watchdog.c b/kernel/kernel/watchdog.c
index 201775b44..fa2e079cc 100644
--- a/kernel/kernel/watchdog.c
+++ b/kernel/kernel/watchdog.c
@@ -330,7 +330,6 @@ static void watchdog_overflow_callback(struct perf_event *event,
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
- struct pt_regs *regs = get_irq_regs();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
@@ -361,7 +360,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
raw_spin_unlock(&watchdog_output_lock);
if (hardlockup_panic)
- panic("Hard LOCKUP");
+ nmi_panic(regs, "Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -918,6 +917,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* both lockup detectors are disabled if proc_watchdog_update()
* returns an error.
*/
+ if (old == new)
+ goto out;
+
err = proc_watchdog_update();
}
out:
@@ -962,7 +964,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old;
+ int err, old, new;
get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
@@ -982,6 +984,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
/*
* Update the sample period. Restore on failure.
*/
+ new = ACCESS_ONCE(watchdog_thresh);
+ if (old == new)
+ goto out;
+
set_sample_period();
err = proc_watchdog_update();
if (err) {
diff --git a/kernel/kernel/workqueue.c b/kernel/kernel/workqueue.c
index efd1f5822..d5b0f4fc0 100644
--- a/kernel/kernel/workqueue.c
+++ b/kernel/kernel/workqueue.c
@@ -683,6 +683,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ /*
+ * The following mb guarantees that previous clear of a PENDING bit
+ * will not be reordered with any speculative LOADS or STORES from
+ * work->current_func, which is executed afterwards. This possible
+ * reordering can lead to a missed execution on attempt to qeueue
+ * the same @work. E.g. consider this case:
+ *
+ * CPU#0 CPU#1
+ * ---------------------------- --------------------------------
+ *
+ * 1 STORE event_indicated
+ * 2 queue_work_on() {
+ * 3 test_and_set_bit(PENDING)
+ * 4 } set_..._and_clear_pending() {
+ * 5 set_work_data() # clear bit
+ * 6 smp_mb()
+ * 7 work->current_func() {
+ * 8 LOAD event_indicated
+ * }
+ *
+ * Without an explicit full barrier speculative LOAD on line 8 can
+ * be executed before CPU#0 does STORE on line 1. If that happens,
+ * CPU#0 observes the PENDING bit is still set and new execution of
+ * a @work is not queued in a hope, that CPU#1 will eventually
+ * finish the queued @work. Meanwhile CPU#1 does not see
+ * event_indicated is set, because speculative LOAD was executed
+ * before actual STORE.
+ */
+ smp_mb();
}
static void clear_work_data(struct work_struct *work)
@@ -1106,9 +1135,11 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
* As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
+ rcu_read_lock();
local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
put_pwq(pwq);
local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
+ rcu_read_unlock();
}
}
@@ -4466,6 +4497,17 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+
+ /*
+ * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+ * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
+ * being reworked and this can go away in time.
+ */
+ if (!(pool->flags & POOL_DISASSOCIATED)) {
+ spin_unlock_irq(&pool->lock);
+ return;
+ }
+
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {