From 021de3d904b88b1771a3a2cfc5b75023c391e646 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Aug 2014 15:36:31 -0400 Subject: ring-buffer: Up rb_iter_peek() loop count to 3 After writting a test to try to trigger the bug that caused the ring buffer iterator to become corrupted, I hit another bug: WARNING: CPU: 1 PID: 5281 at kernel/trace/ring_buffer.c:3766 rb_iter_peek+0x113/0x238() Modules linked in: ipt_MASQUERADE sunrpc [...] CPU: 1 PID: 5281 Comm: grep Tainted: G W 3.16.0-rc3-test+ #143 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 0000000000000000 ffffffff81809a80 ffffffff81503fb0 0000000000000000 ffffffff81040ca1 ffff8800796d6010 ffffffff810c138d ffff8800796d6010 ffff880077438c80 ffff8800796d6010 ffff88007abbe600 0000000000000003 Call Trace: [] ? dump_stack+0x4a/0x75 [] ? warn_slowpath_common+0x7e/0x97 [] ? rb_iter_peek+0x113/0x238 [] ? rb_iter_peek+0x113/0x238 [] ? ring_buffer_iter_peek+0x2d/0x5c [] ? tracing_iter_reset+0x6e/0x96 [] ? s_start+0xd7/0x17b [] ? kmem_cache_alloc_trace+0xda/0xea [] ? seq_read+0x148/0x361 [] ? vfs_read+0x93/0xf1 [] ? SyS_read+0x60/0x8e [] ? tracesys+0xdd/0xe2 Debugging this bug, which triggers when the rb_iter_peek() loops too many times (more than 2 times), I discovered there's a case that can cause that function to legitimately loop 3 times! rb_iter_peek() is different than rb_buffer_peek() as the rb_buffer_peek() only deals with the reader page (it's for consuming reads). The rb_iter_peek() is for traversing the buffer without consuming it, and as such, it can loop for one more reason. That is, if we hit the end of the reader page or any page, it will go to the next page and try again. That is, we have this: 1. iter->head > iter->head_page->page->commit (rb_inc_iter() which moves the iter to the next page) try again 2. event = rb_iter_head_event() event->type_len == RINGBUF_TYPE_TIME_EXTEND rb_advance_iter() try again 3. read the event. But we never get to 3, because the count is greater than 2 and we cause the WARNING and return NULL. Up the counter to 3. Cc: stable@vger.kernel.org # 2.6.37+ Fixes: 69d1b839f7ee "ring-buffer: Bind time extend and data events together" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ff7027199a9a..31a9edd7aa93 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1984,7 +1984,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) /** * rb_update_event - update event type and data - * @event: the even to update + * @event: the event to update * @type: the type of event * @length: the size of the event field in the ring buffer * @@ -3764,12 +3764,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. + * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) -- cgit v1.2.3 From 651e22f2701b4113989237c3048d17337dd2185c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Aug 2014 14:11:33 -0400 Subject: ring-buffer: Always reset iterator to reader page When performing a consuming read, the ring buffer swaps out a page from the ring buffer with a empty page and this page that was swapped out becomes the new reader page. The reader page is owned by the reader and since it was swapped out of the ring buffer, writers do not have access to it (there's an exception to that rule, but it's out of scope for this commit). When reading the "trace" file, it is a non consuming read, which means that the data in the ring buffer will not be modified. When the trace file is opened, a ring buffer iterator is allocated and writes to the ring buffer are disabled, such that the iterator will not have issues iterating over the data. Although the ring buffer disabled writes, it does not disable other reads, or even consuming reads. If a consuming read happens, then the iterator is reset and starts reading from the beginning again. My tests would sometimes trigger this bug on my i386 box: WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa() Modules linked in: CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0 ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80^M Call Trace: [] dump_stack+0x4b/0x75 [] warn_slowpath_common+0x7e/0x95 [] ? __trace_find_cmdline+0x66/0xaa [] ? __trace_find_cmdline+0x66/0xaa [] warn_slowpath_fmt+0x33/0x35 [] __trace_find_cmdline+0x66/0xaa^M [] trace_find_cmdline+0x40/0x64 [] trace_print_context+0x27/0xec [] ? trace_seq_printf+0x37/0x5b [] print_trace_line+0x319/0x39b [] ? ring_buffer_read+0x47/0x50 [] s_show+0x192/0x1ab [] ? s_next+0x5a/0x7c [] seq_read+0x267/0x34c [] vfs_read+0x8c/0xef [] ? seq_lseek+0x154/0x154 [] SyS_read+0x54/0x7f [] syscall_call+0x7/0xb ---[ end trace 3f507febd6b4cc83 ]--- >>>> ##### CPU 1 buffer started #### Which was the __trace_find_cmdline() function complaining about the pid in the event record being negative. After adding more test cases, this would trigger more often. Strangely enough, it would never trigger on a single test, but instead would trigger only when running all the tests. I believe that was the case because it required one of the tests to be shutting down via delayed instances while a new test started up. After spending several days debugging this, I found that it was caused by the iterator becoming corrupted. Debugging further, I found out why the iterator became corrupted. It happened with the rb_iter_reset(). As consuming reads may not read the full reader page, and only part of it, there's a "read" field to know where the last read took place. The iterator, must also start at the read position. In the rb_iter_reset() code, if the reader page was disconnected from the ring buffer, the iterator would start at the head page within the ring buffer (where writes still happen). But the mistake there was that it still used the "read" field to start the iterator on the head page, where it should always start at zero because readers never read from within the ring buffer where writes occur. I originally wrote a patch to have it set the iter->head to 0 instead of iter->head_page->read, but then I questioned why it wasn't always setting the iter to point to the reader page, as the reader page is still valid. The list_empty(reader_page->list) just means that it was successful in swapping out. But the reader_page may still have data. There was a bug report a long time ago that was not reproducible that had something about trace_pipe (consuming read) not matching trace (iterator read). This may explain why that happened. Anyway, the correct answer to this bug is to always use the reader page an not reset the iterator to inside the writable ring buffer. Cc: stable@vger.kernel.org # 2.6.28+ Fixes: d769041f8653 "ring_buffer: implement new locking" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 31a9edd7aa93..b95381ebdd5e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3357,21 +3357,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = iter->head; + if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; } /** -- cgit v1.2.3 From 4ce97dbf50245227add17c83d87dc838e7ca79d0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Aug 2014 13:59:41 -0400 Subject: trace: Fix epoll hang when we race with new entries Epoll on trace_pipe can sometimes hang in a weird case. If the ring buffer is empty when we set waiters_pending but an event shows up exactly at that moment we can miss being woken up by the ring buffers irq work. Since ring_buffer_empty() is inherently racey we will sometimes think that the buffer is not empty. So we don't get woken up and we don't think there are any events even though there were some ready when we added the watch, which makes us hang. This patch fixes this by making sure that we are actually on the wait list before we set waiters_pending, and add a memory barrier to make sure ring_buffer_empty() is going to be correct. Link: http://lkml.kernel.org/p/1408989581-23727-1-git-send-email-jbacik@fb.com Cc: stable@vger.kernel.org # 3.10+ Cc: Martin Lau Signed-off-by: Josef Bacik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index afb04b9b818a..b38fb2b9e237 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - work->waiters_pending = true; poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) -- cgit v1.2.3 From 24607f114fd14f2f37e3e0cb3d47bce96e81e848 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 2 Oct 2014 16:51:18 -0400 Subject: ring-buffer: Fix infinite spin in reading buffer Commit 651e22f2701b "ring-buffer: Always reset iterator to reader page" fixed one bug but in the process caused another one. The reset is to update the header page, but that fix also changed the way the cached reads were updated. The cache reads are used to test if an iterator needs to be updated or not. A ring buffer iterator, when created, disables writes to the ring buffer but does not stop other readers or consuming reads from happening. Although all readers are synchronized via a lock, they are only synchronized when in the ring buffer functions. Those functions may be called by any number of readers. The iterator continues down when its not interrupted by a consuming reader. If a consuming read occurs, the iterator starts from the beginning of the buffer. The way the iterator sees that a consuming read has happened since its last read is by checking the reader "cache". The cache holds the last counts of the read and the reader page itself. Commit 651e22f2701b changed what was saved by the cache_read when the rb_iter_reset() occurred, making the iterator never match the cache. Then if the iterator calls rb_iter_reset(), it will go into an infinite loop by checking if the cache doesn't match, doing the reset and retrying, just to see that the cache still doesn't match! Which should never happen as the reset is suppose to set the cache to the current value and there's locks that keep a consuming reader from having access to the data. Fixes: 651e22f2701b "ring-buffer: Always reset iterator to reader page" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b38fb2b9e237..2d75c94ae87d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3359,7 +3359,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->head = cpu_buffer->reader_page->read; iter->cache_reader_page = iter->head_page; - iter->cache_read = iter->head; + iter->cache_read = cpu_buffer->read; if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; -- cgit v1.2.3 From e30f53aad2202b5526c40c36d8eeac8bf290bde5 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 10 Nov 2014 19:46:34 +0100 Subject: tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [] ? __wake_up_common+0x90/0x90 [] wait_on_pipe+0x35/0x40 [] tracing_buffers_splice_read+0x2e3/0x3c0 [] ? tracing_stats_read+0x2a0/0x2a0 [] ? _raw_spin_unlock+0x2b/0x40 [] ? do_read_fault+0x21b/0x290 [] ? handle_mm_fault+0x2ba/0xbd0 [] ? trace_event_buffer_lock_reserve+0x40/0x80 [] ? trace_buffer_lock_reserve+0x22/0x60 [] ? trace_event_buffer_lock_reserve+0x40/0x80 [] do_splice_to+0x6d/0x90 [] SyS_splice+0x7c1/0x800 [] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 81 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace.c | 23 ++++--------- 3 files changed, 62 insertions(+), 44 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 49a4d6f59108..e2c13cd863bd 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) -int ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full); int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d75c94ae87d..a56e07c8d15b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); struct rb_irq_work *work; + int ret = 0; /* * Depending on what the caller is waiting for, either any @@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) } - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + while (true) { + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - work->waiters_pending = true; + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } - if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) schedule(); + } finish_wait(&work->waiters, &wait); - return 0; + + return ret; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a528392b1f4..15209335888d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter, bool full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&iter->mutex); if (ret) return ret; - - if (signal_pending(current)) - return -EINTR; } return 1; @@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&trace_types_lock); if (ret) { size = ret; goto out_unlock; } - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } goto again; } size = 0; @@ -5587,14 +5581,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, true); mutex_lock(&trace_types_lock); if (ret) goto out; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + goto again; } -- cgit v1.2.3