From 14fd6ba8248b82c4bf3efa960bbf15a85b09cce0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 12 Mar 2018 09:59:43 -0700 Subject: perf/cgroup: Fix child event counting bug [ Upstream commit c917e0f259908e75bd2a65877e25f9d90c22c848 ] When a perf_event is attached to parent cgroup, it should count events for all children cgroups: parent_group <---- perf_event \ - child_group <---- process(es) However, in our tests, we found this perf_event cannot report reliable results. Here is an example case: # create cgroups mkdir -p /sys/fs/cgroup/p/c # start perf for parent group perf stat -e instructions -G "p" # on another console, run test process in child cgroup: stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs # after the test process is done, stop perf in the first console shows instructions p The instruction should not be "not counted" as the process runs in the child cgroup. We found this is because perf_event->cgrp and cpuctx->cgrp are not identical, thus perf_event->cgrp are not updated properly. This patch fixes this by updating perf_cgroup properly for ancestor cgroup(s). Reported-by: Ephraim Park Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20180312165943.1057894-1-songliubraving@fb.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 92d1f12f4407..06bb884f2c3b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -419,9 +419,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp) static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; + + if (cgrp) { + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + __update_cgrp_time(cgrp); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -449,6 +455,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; + struct cgroup_subsys_state *css; /* * ctx->lock held by caller @@ -459,8 +466,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, return; cgrp = perf_cgroup_from_task(task, ctx); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; + } } #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -- cgit v1.2.3 From 9a1dcfb8ff30b3afe121d11606ed332cd1536c16 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Mar 2018 12:52:04 +0100 Subject: perf/core: Fix perf_output_read_group() [ Upstream commit 9e5b127d6f33468143d90c8a45ca12410e4c3fa7 ] Mark reported his arm64 perf fuzzer runs sometimes splat like: armv8pmu_read_counter+0x1e8/0x2d8 armpmu_event_update+0x8c/0x188 armpmu_read+0xc/0x18 perf_output_read+0x550/0x11e8 perf_event_read_event+0x1d0/0x248 perf_event_exit_task+0x468/0xbb8 do_exit+0x690/0x1310 do_group_exit+0xd0/0x2b0 get_signal+0x2e8/0x17a8 do_signal+0x144/0x4f8 do_notify_resume+0x148/0x1e8 work_pending+0x8/0x14 which asserts that we only call pmu::read() on ACTIVE events. The above callchain does: perf_event_exit_task() perf_event_exit_task_context() task_ctx_sched_out() // INACTIVE perf_event_exit_event() perf_event_set_state(EXIT) // EXIT sync_child_event() perf_event_read_event() perf_output_read() perf_output_read_group() leader->pmu->read() Which results in doing a pmu::read() on an !ACTIVE event. I _think_ this is 'new' since we added attr.inherit_stat, which added the perf_event_read_event() to the exit path, without that perf_event_read_output() would only trigger from samples and for @event to trigger a sample, it's leader _must_ be ACTIVE too. Still, adding this check makes it consistent with the @sub case for the siblings. Reported-and-Tested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 06bb884f2c3b..990ac41d8a5f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5299,7 +5299,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if (leader != event) + if ((leader != event) && + (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); -- cgit v1.2.3