From f08bc4d6337768124d49faaada449e4803d42e8b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 7 Apr 2017 02:33:30 +0200 Subject: padata: free correct variable commit 07a77929ba672d93642a56dc2255dd21e6e2290b upstream. The author meant to free the variable that was just allocated, instead of the one that failed to be allocated, but made a simple typo. This patch rectifies that. Signed-off-by: Jason A. Donenfeld Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- kernel/padata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 401227e3967c..ecc7b3f452c7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -357,7 +357,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd, cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.cbcpu); + free_cpumask_var(pd->cpumask.pcpu); return -ENOMEM; } -- cgit v1.2.3 From ddf9b92f12dd9139789786a4ba1a33dbdf693b8a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 11 May 2017 18:21:01 -0500 Subject: pid_ns: Sleep in TASK_INTERRUPTIBLE in zap_pid_ns_processes commit b9a985db98961ae1ba0be169f19df1c567e4ffe0 upstream. The code can potentially sleep for an indefinite amount of time in zap_pid_ns_processes(), triggering the hung task timeout, and increasing the system load average. This is undesirable. Sleep with a task state of TASK_INTERRUPTIBLE instead of TASK_UNINTERRUPTIBLE to remove these undesirable side effects. Apparently under heavy load this has been allowing Chrome to trigger the hung task timeout error and cause ChromeOS to reboot. Reported-by: Vovo Yang Reported-by: Guenter Roeck Tested-by: Guenter Roeck Fixes: 6347e9009104 ("pidns: guarantee that the pidns init will be the last pidns process reaped") Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..567ecc826bc8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -255,7 +255,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * if reparented. */ for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->nr_hashed == init_pids) break; schedule(); -- cgit v1.2.3 From 6a70a5833ecc9147d8257e80f39e11d582810082 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 May 2017 19:11:31 +0300 Subject: pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes() commit 3fd37226216620c1a468afa999739d5016fbc349 upstream. Imagine we have a pid namespace and a task from its parent's pid_ns, which made setns() to the pid namespace. The task is doing fork(), while the pid namespace's child reaper is dying. We have the race between them: Task from parent pid_ns Child reaper copy_process() .. alloc_pid() .. .. zap_pid_ns_processes() .. disable_pid_allocation() .. read_lock(&tasklist_lock) .. iterate over pids in pid_ns .. kill tasks linked to pids .. read_unlock(&tasklist_lock) write_lock_irq(&tasklist_lock); .. attach_pid(p, PIDTYPE_PID); .. .. .. So, just created task p won't receive SIGKILL signal, and the pid namespace will be in contradictory state. Only manual kill will help there, but does the userspace care about this? I suppose, the most users just inject a task into a pid namespace and wait a SIGCHLD from it. The patch fixes the problem. It simply checks for (pid_ns->nr_hashed & PIDNS_HASH_ADDING) in copy_process(). We do it under the tasklist_lock, and can't skip PIDNS_HASH_ADDING as noted by Oleg: "zap_pid_ns_processes() does disable_pid_allocation() and then takes tasklist_lock to kill the whole namespace. 
Given that copy_process() checks PIDNS_HASH_ADDING under write_lock(tasklist) they can't race; if copy_process() takes this lock first, the new child will be killed, otherwise copy_process() can't miss the change in ->nr_hashed." If allocation is disabled, we just return -ENOMEM like it's made for such cases in alloc_pid(). v2: Do not move disable_pid_allocation(), do not introduce a new variable in copy_process() and simplify the patch as suggested by Oleg Nesterov. Account the problem with double irq enabling found by Eric W. Biederman. Fixes: c876ad768215 ("pidns: Stop pid allocation when init dies") Signed-off-by: Kirill Tkhai CC: Andrew Morton CC: Ingo Molnar CC: Peter Zijlstra CC: Oleg Nesterov CC: Mike Rapoport CC: Michal Hocko CC: Andy Lutomirski CC: "Eric W. Biederman" CC: Andrei Vagin CC: Cyrill Gorcunov CC: Serge Hallyn Acked-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 278a2ddad351..0ee630f3ad4b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1590,11 +1590,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ recalc_sigpending(); if (signal_pending(current)) { - spin_unlock(&current->sighand->siglock); - write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } + if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1645,6 +1647,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: + spin_unlock(&current->sighand->siglock); + write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: threadgroup_change_end(current); -- cgit v1.2.3 From f01ae9cb0de282abfd20cd3c2e3477adbdb766ce Mon Sep 17 
00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:15 +0300 Subject: sched/fair: Do not announce throttled next buddy in dequeue_task_fair() commit 754bd598be9bbc953bc709a9e8ed7f3188bfb9d7 upstream. Hierarchy could be already throttled at this point. Throttled next buddy could trigger a NULL pointer dereference in pick_next_task_fair(). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/146608183552.21905.15924473394414832071.stgit@buzz Signed-off-by: Ingo Molnar Cc: Ben Pineau Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8f258f437ac2..3fa53654b7f2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4233,15 +4233,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; -- cgit v1.2.3 From ada79b5ecda79ec7b53053d9955a5ee04c8dd633 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:01 +0300 Subject: sched/fair: Initialize throttle_count for new task-groups lazily commit 094f469172e00d6ab0a3130b0e01c83b3cf3a98d upstream. Cgroup created inside throttled group must inherit current throttle_count. 
A broken throttle_count allows throttled entries to be nominated as a next buddy, which later leads to a NULL pointer dereference in pick_next_task_fair(). This patch initializes cfs_rq->throttle_count at first enqueue: laziness allows us to skip locking all rqs at group creation. The lazy approach also allows skipping the full sub-tree scan when throttling a hierarchy (not in this patch). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Link: http://lkml.kernel.org/r/146608182119.21870.8439834428248129633.stgit@buzz Signed-off-by: Ingo Molnar Cc: Ben Pineau Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 20 ++++++++++++++++++++ kernel/sched/sched.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3fa53654b7f2..812069b66f47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3918,6 +3918,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0517abd7dd73..4e5db65d1aab 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -417,7 +417,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 
throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From e07db0d720d37678976956a5f972828fa6dca5a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 May 2017 13:54:11 +0200 Subject: genirq: Fix chained interrupt data ordering commit 2c4569ca26986d18243f282dd727da27e9adae4c upstream. irq_set_chained_handler_and_data() sets up the chained interrupt and then stores the handler data. That's racy against an immediate interrupt which gets handled before the store of the handler data happened. The handler will dereference a NULL pointer and crash. Cure it by storing handler data before installing the chained handler. Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b12a..e4453d9f788c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -810,8 +810,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, if (!desc) return; - __irq_do_set_handler(desc, handle, 1, NULL); desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); irq_put_desc_busunlock(desc, flags); } -- cgit v1.2.3 From 6384f782a69cf93a8a59322e1b6cf29f27fa0c8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 May 2017 10:19:49 +0200 Subject: tracing/kprobes: Enforce kprobes teardown after testing commit 30e7d894c1478c88d50ce94ddcdbd7f9763d9cdd upstream. Enabling the tracer selftest triggers occasionally the warning in text_poke(), which warns when the to be modified page is not marked reserved. The reason is that the tracer selftest installs kprobes on functions marked __init for testing. 
These probes are removed after the tests, but that removal schedules the delayed kprobes_optimizer work, which will do the actual text poke. If the work is executed after the init text is freed, then the warning triggers. The bug can be reproduced reliably when the work delay is increased. Flush the optimizer work and wait for the optimizing/unoptimizing lists to become empty before returning from the kprobes tracer selftest. That ensures that all operations which were queued due to the probes removal have completed. Link: http://lkml.kernel.org/r/20170516094802.76a468bb@gandalf.local.home Signed-off-by: Thomas Gleixner Acked-by: Masami Hiramatsu Fixes: 6274de498 ("kprobes: Support delayed unoptimizing") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- include/linux/kprobes.h | 4 +++- kernel/kprobes.c | 2 +- kernel/trace/trace_kprobe.c | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8f6849084248..e23392517db9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -330,7 +330,9 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif - +extern void wait_for_kprobe_optimizer(void); +#else +static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..695763516908 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -563,7 +563,7 @@ static void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static void wait_for_kprobe_optimizer(void) +void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 
c9956440d0e6..12ea4ea619ee 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1471,6 +1471,11 @@ static __init int kprobe_trace_self_tests_init(void) end: release_all_trace_kprobes(); + /* + * Wait for the optimizer work to finish. Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else -- cgit v1.2.3