From 1d4d262769cd1894a0306b9c57e72f005cd9e75a Mon Sep 17 00:00:00 2001
From: Jan Dittmer <jdi@l4x.org>
Date: Sat, 28 Oct 2006 10:38:38 -0700
Subject: [PATCH] Add missing space in module.c for taintskernel

Obvious fix.

Signed-off-by: Jan Dittmer <jdi@l4x.org>
Acked-by: Florin Malita <fmalita@gmail.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 67009bd56c52..5072a943fe35 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1342,7 +1342,7 @@ static void set_license(struct module *mod, const char *license)
 
 	if (!license_is_gpl_compatible(license)) {
 		if (!(tainted & TAINT_PROPRIETARY_MODULE))
-			printk(KERN_WARNING "%s: module license '%s' taints"
+			printk(KERN_WARNING "%s: module license '%s' taints "
 				"kernel.\n", mod->name, license);
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 	}
-- 
cgit v1.2.3


From 5fa3839a64203b2ab727dcb37da9b2d7079fca28 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sat, 28 Oct 2006 10:38:46 -0700
Subject: [PATCH] Constify compat_get_bitmap argument

This means we can call it when the bitmap we want to fetch is declared
const.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compat.h | 2 +-
 kernel/compat.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index f4ebf96f5308..f1553196826f 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -196,7 +196,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 #define BITS_TO_COMPAT_LONGS(bits) \
 	(((bits)+BITS_PER_COMPAT_LONG-1)/BITS_PER_COMPAT_LONG)
 
-long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 		       unsigned long bitmap_size);
 long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 		       unsigned long bitmap_size);
diff --git a/kernel/compat.c b/kernel/compat.c
index 75573e5d27b0..d4898aad6cfa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event,
 		? -EFAULT : 0;
 }
 
-long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 		       unsigned long bitmap_size)
 {
 	int i, j;
-- 
cgit v1.2.3


From fca178c0c6e8d52a1875be36b070f30884ebfae9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:49 -0700
Subject: [PATCH] fill_tgid: fix task_struct leak and possible oops

1. fill_tgid() forgets to do put_task_struct(first).

2. release_task(first) can happen after fill_tgid() drops tasklist_lock,
   it is unsafe to dereference first->signal.

This is a temporary fix, imho the locking should be reworked.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 5d6a8c54ee85..9aeee511a463 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -237,14 +237,17 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 	} else
 		get_task_struct(first);
 
-	/* Start with stats from dead tasks */
-	spin_lock_irqsave(&first->signal->stats_lock, flags);
-	if (first->signal->stats)
-		memcpy(stats, first->signal->stats, sizeof(*stats));
-	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
 
 	tsk = first;
 	read_lock(&tasklist_lock);
+	/* Start with stats from dead tasks */
+	if (first->signal) {
+		spin_lock_irqsave(&first->signal->stats_lock, flags);
+		if (first->signal->stats)
+			memcpy(stats, first->signal->stats, sizeof(*stats));
+		spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+	}
+
 	do {
 		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
 			continue;
@@ -264,7 +267,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 	 * Accounting subsytems can also add calls here to modify
 	 * fields of taskstats.
 	 */
-
+	put_task_struct(first);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 05d5bcd60e8202e5c7b28cf61186043a4d612623 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:50 -0700
Subject: [PATCH] bacct_add_tsk: fix unsafe and wrong parent/group_leader
 dereference

1. ts = timespec_sub(uptime, current->group_leader->start_time);

   It is possible that current != tsk. Probably it was supposed
   to be 'tsk->group_leader->start_time. But why we are reading
   group_leader's start_time ? This accounting is per thread,
   not per procees, I changed this to 'tsk->start_time.
   Please corect me.

2. stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0;

   tsk->parent never == NULL, and it is unsafe to dereference it.
   Both the task and it's parent may exit after the caller unlocks
   tasklist_lock, the memory could be unmapped (DEBUG_SLAB).
   (And we should use ->real_parent->tgid in fact).

Q: I don't understand the 'if (thread_group_leader(tsk))' check.
Why it is needed ?

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/tsacct.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index db443221ba5b..65a5036a3d95 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 
 	/* calculate task elapsed time in timespec */
 	do_posix_clock_monotonic_gettime(&uptime);
-	ts = timespec_sub(uptime, current->group_leader->start_time);
+	ts = timespec_sub(uptime, tsk->start_time);
 	/* rebase elapsed time to usec */
 	ac_etime = timespec_to_ns(&ts);
 	do_div(ac_etime, NSEC_PER_USEC);
@@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	stats->ac_uid	 = tsk->uid;
 	stats->ac_gid	 = tsk->gid;
 	stats->ac_pid	 = tsk->pid;
-	stats->ac_ppid	 = (tsk->parent) ? tsk->parent->pid : 0;
+	rcu_read_lock();
+	stats->ac_ppid	 = pid_alive(tsk) ?
+				rcu_dereference(tsk->real_parent)->tgid : 0;
+	rcu_read_unlock();
 	stats->ac_utime	 = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
 	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
 	stats->ac_minflt = tsk->min_flt;
-- 
cgit v1.2.3


From 093a8e8aecd77b2799934996a55a6838e1e2b8f3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:51 -0700
Subject: [PATCH] taskstats_tgid_free: fix usage

taskstats_tgid_free() is called on copy_process's error path. This is wrong.

	IF (clone_flags & CLONE_THREAD)
		We should not clear ->signal->taskstats, current uses it,
		it probably has a valid accumulated info.
	ELSE
		taskstats_tgid_init() set ->signal->taskstats = NULL,
		there is nothing to free.

Move the callsite to __exit_signal(). We don't need any locking, entire
thread group is exiting, nobody should have a reference to soon to be
released ->signal.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/taskstats_kern.h | 13 ++-----------
 kernel/exit.c                  |  1 +
 kernel/fork.c                  |  1 -
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 16894b7edcc8..a437ca0d226b 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -49,17 +49,8 @@ static inline void taskstats_tgid_alloc(struct signal_struct *sig)
 
 static inline void taskstats_tgid_free(struct signal_struct *sig)
 {
-	struct taskstats *stats = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sig->stats_lock, flags);
-	if (sig->stats) {
-		stats = sig->stats;
-		sig->stats = NULL;
-	}
-	spin_unlock_irqrestore(&sig->stats_lock, flags);
-	if (stats)
-		kmem_cache_free(taskstats_cache, stats);
+	if (sig->stats)
+		kmem_cache_free(taskstats_cache, sig->stats);
 }
 
 extern void taskstats_exit_alloc(struct taskstats **, unsigned int *);
diff --git a/kernel/exit.c b/kernel/exit.c
index f250a5e3e281..06de6c4e8ca3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		flush_sigqueue(&sig->shared_pending);
+		taskstats_tgid_free(sig);
 		__cleanup_signal(sig);
 	}
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 29ebb30850ed..213326609bac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -897,7 +897,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
-	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
-- 
cgit v1.2.3


From b8534d7bd89df0cd41cd47bcd6733a05ea9a691a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:53 -0700
Subject: [PATCH] taskstats: kill ->taskstats_lock in favor of ->siglock

signal_struct is (mostly) protected by ->sighand->siglock, I think we don't
need ->taskstats_lock to protect ->stats.  This also allows us to simplify the
locking in fill_tgid().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h          |  1 -
 include/linux/taskstats_kern.h | 15 ++++++---------
 kernel/fork.c                  |  2 +-
 kernel/taskstats.c             | 16 ++++++----------
 4 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6735c1cf334c..eafe4a7b8237 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -466,7 +466,6 @@ struct signal_struct {
 	struct pacct_struct pacct;	/* per-process accounting information */
 #endif
 #ifdef CONFIG_TASKSTATS
-	spinlock_t stats_lock;
 	struct taskstats *stats;
 #endif
 };
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 664224008fb2..6562a2050a25 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -23,28 +23,26 @@ static inline void taskstats_exit_free(struct taskstats *tidstats)
 
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {
-	spin_lock_init(&sig->stats_lock);
 	sig->stats = NULL;
 }
 
-static inline void taskstats_tgid_alloc(struct signal_struct *sig)
+static inline void taskstats_tgid_alloc(struct task_struct *tsk)
 {
+	struct signal_struct *sig = tsk->signal;
 	struct taskstats *stats;
-	unsigned long flags;
 
 	if (sig->stats != NULL)
 		return;
 
+	/* No problem if kmem_cache_zalloc() fails */
 	stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-	if (!stats)
-		return;
 
-	spin_lock_irqsave(&sig->stats_lock, flags);
+	spin_lock_irq(&tsk->sighand->siglock);
 	if (!sig->stats) {
 		sig->stats = stats;
 		stats = NULL;
 	}
-	spin_unlock_irqrestore(&sig->stats_lock, flags);
+	spin_unlock_irq(&tsk->sighand->siglock);
 
 	if (stats)
 		kmem_cache_free(taskstats_cache, stats);
@@ -59,7 +57,6 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)
 extern void taskstats_exit_alloc(struct taskstats **, unsigned int *);
 extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int);
 extern void taskstats_init_early(void);
-extern void taskstats_tgid_alloc(struct signal_struct *);
 #else
 static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 {}
@@ -71,7 +68,7 @@ static inline void taskstats_exit_send(struct task_struct *tsk,
 {}
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {}
-static inline void taskstats_tgid_alloc(struct signal_struct *sig)
+static inline void taskstats_tgid_alloc(struct task_struct *tsk)
 {}
 static inline void taskstats_tgid_free(struct signal_struct *sig)
 {}
diff --git a/kernel/fork.c b/kernel/fork.c
index 213326609bac..3da978eec791 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -830,7 +830,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
-		taskstats_tgid_alloc(current->signal);
+		taskstats_tgid_alloc(current);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9aeee511a463..b2efda94615a 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -241,11 +241,11 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 	tsk = first;
 	read_lock(&tasklist_lock);
 	/* Start with stats from dead tasks */
-	if (first->signal) {
-		spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->sighand) {
+		spin_lock_irqsave(&first->sighand->siglock, flags);
 		if (first->signal->stats)
 			memcpy(stats, first->signal->stats, sizeof(*stats));
-		spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+		spin_unlock_irqrestore(&first->sighand->siglock, flags);
 	}
 
 	do {
@@ -276,7 +276,7 @@ static void fill_tgid_exit(struct task_struct *tsk)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
 	if (!tsk->signal->stats)
 		goto ret;
 
@@ -288,7 +288,7 @@ static void fill_tgid_exit(struct task_struct *tsk)
 	 */
 	delayacct_add_tsk(tsk->signal->stats, tsk);
 ret:
-	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
 	return;
 }
 
@@ -464,15 +464,10 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size_t size;
 	int is_thread_group;
 	struct nlattr *na;
-	unsigned long flags;
 
 	if (!family_registered || !tidstats)
 		return;
 
-	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
-	is_thread_group = tsk->signal->stats ? 1 : 0;
-	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
-
 	rc = 0;
 	/*
 	 * Size includes space for nested attributes
@@ -480,6 +475,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
+	is_thread_group = (tsk->signal->stats != NULL);
 	if (is_thread_group)
 		size = 2 * size;	/* PID + STATS + TGID + STATS */
 
-- 
cgit v1.2.3


From a98b6094261c0112e9c455c96995972181bff049 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:54 -0700
Subject: [PATCH] taskstats: don't use tasklist_lock

Remove tasklist_lock from taskstats.c. find_task_by_pid() is rcu-safe.
->siglock allows us to traverse subthread without tasklist.

Q: delay accounting looks wrong to me.  If sub-thread has already called
taskstats_exit_send() but didn't call release_task(self) yet it will be
accounted twice.  The window is big.  No?

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 59 ++++++++++++++++++++++--------------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b2efda94615a..b724aeea5443 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -174,21 +174,19 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	up_write(&listeners->sem);
 }
 
-static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+static int fill_pid(pid_t pid, struct task_struct *tsk,
 		struct taskstats *stats)
 {
 	int rc = 0;
-	struct task_struct *tsk = pidtsk;
 
-	if (!pidtsk) {
-		read_lock(&tasklist_lock);
+	if (!tsk) {
+		rcu_read_lock();
 		tsk = find_task_by_pid(pid);
-		if (!tsk) {
-			read_unlock(&tasklist_lock);
+		if (tsk)
+			get_task_struct(tsk);
+		rcu_read_unlock();
+		if (!tsk)
 			return -ESRCH;
-		}
-		get_task_struct(tsk);
-		read_unlock(&tasklist_lock);
 	} else
 		get_task_struct(tsk);
 
@@ -214,40 +212,28 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 
 }
 
-static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+static int fill_tgid(pid_t tgid, struct task_struct *first,
 		struct taskstats *stats)
 {
-	struct task_struct *tsk, *first;
+	struct task_struct *tsk;
 	unsigned long flags;
+	int rc = -ESRCH;
 
 	/*
 	 * Add additional stats from live tasks except zombie thread group
 	 * leaders who are already counted with the dead tasks
 	 */
-	first = tgidtsk;
-	if (!first) {
-		read_lock(&tasklist_lock);
+	rcu_read_lock();
+	if (!first)
 		first = find_task_by_pid(tgid);
-		if (!first) {
-			read_unlock(&tasklist_lock);
-			return -ESRCH;
-		}
-		get_task_struct(first);
-		read_unlock(&tasklist_lock);
-	} else
-		get_task_struct(first);
 
+	if (!first || !lock_task_sighand(first, &flags))
+		goto out;
 
-	tsk = first;
-	read_lock(&tasklist_lock);
-	/* Start with stats from dead tasks */
-	if (first->sighand) {
-		spin_lock_irqsave(&first->sighand->siglock, flags);
-		if (first->signal->stats)
-			memcpy(stats, first->signal->stats, sizeof(*stats));
-		spin_unlock_irqrestore(&first->sighand->siglock, flags);
-	}
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
 
+	tsk = first;
 	do {
 		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
 			continue;
@@ -260,15 +246,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		delayacct_add_tsk(stats, tsk);
 
 	} while_each_thread(first, tsk);
-	read_unlock(&tasklist_lock);
-	stats->version = TASKSTATS_VERSION;
 
+	unlock_task_sighand(first, &flags);
+	rc = 0;
+out:
+	rcu_read_unlock();
+
+	stats->version = TASKSTATS_VERSION;
 	/*
 	 * Accounting subsytems can also add calls here to modify
 	 * fields of taskstats.
 	 */
-	put_task_struct(first);
-	return 0;
+	return rc;
 }
 
 
-- 
cgit v1.2.3


From d7c3f5f231c60d7e6ada5770b536df2b3ec1bd08 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 28 Oct 2006 10:38:54 -0700
Subject: [PATCH] fill_tgid: cleanup delays accounting

fill_tgid() should skip not only an already exited group leader.  If the
task has ->exit_state != 0 it already did exit_notify(), so it also did
fill_tgid_exit()->delayacct_add_tsk(->signal->stats) and we should skip it
to avoid a double accounting.

This patch doesn't close the race completely, but it cleanups the code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b724aeea5443..8adfb8069c6d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -235,7 +235,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 
 	tsk = first;
 	do {
-		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+		if (tsk->exit_state)
 			continue;
 		/*
 		 * Accounting subsystem can call its functions here to
-- 
cgit v1.2.3


From bb1d860551c4307b1a7ee9a21b120319075e987e Mon Sep 17 00:00:00 2001
From: Jim Houston <jim.houston@comcast.net>
Date: Sat, 28 Oct 2006 10:38:56 -0700
Subject: [PATCH] time_adjust cleared before use

I notice that the code which implements adjtime clears the time_adjust
value before using it.  The attached patch makes the obvious fix.

Acked-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Jim Houston <jim.houston@ccur.com>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 47195fa0ec4f..3afeaa3a73f9 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -161,9 +161,9 @@ void second_overflow(void)
 			time_adjust += MAX_TICKADJ;
 			tick_length -= MAX_TICKADJ_SCALED;
 		} else {
-			time_adjust = 0;
 			tick_length += (s64)(time_adjust * NSEC_PER_USEC /
 					     HZ) << TICK_LENGTH_SHIFT;
+			time_adjust = 0;
 		}
 	}
 }
-- 
cgit v1.2.3


From 8fa1d7d3b2c51594c0f3aa151983dd51f605e07d Mon Sep 17 00:00:00 2001
From: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Date: Sat, 28 Oct 2006 10:38:57 -0700
Subject: [PATCH] cpu-hotplug: release `workqueue_mutex' properly on CPU
 hot-remove

_cpu_down() acquires `workqueue_mutex' on its process, but doen't release it
if __cpu_disable() fails.

Signed-off-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 27dd3ee47099..663c920b2234 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -150,18 +150,18 @@ static int _cpu_down(unsigned int cpu)
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
 	mutex_unlock(&cpu_bitmask_lock);
 
-	if (IS_ERR(p)) {
+	if (IS_ERR(p) || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
 				(void *)(long)cpu) == NOTIFY_BAD)
 			BUG();
 
-		err = PTR_ERR(p);
-		goto out_allowed;
-	}
-
-	if (cpu_online(cpu))
+		if (IS_ERR(p)) {
+			err = PTR_ERR(p);
+			goto out_allowed;
+		}
 		goto out_thread;
+	}
 
 	/* Wait for it to sleep (leaving idle task). */
 	while (!idle_cpu(cpu))
-- 
cgit v1.2.3


From 057647fc47b3a5fbcfa997041db3f483d506603c Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sat, 28 Oct 2006 10:38:58 -0700
Subject: [PATCH] workqueue: update kerneldoc

This patch (as812) changes the kerneldoc comments explaining the return
values from queue_work(), queue_delayed_work(), and
queue_delayed_work_on().  The updated comments explain more accurately the
meaning of the return code and avoid suggesting that a 0 value means the
routine was unsuccessful.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3df9bfc7ff78..17c2f03d2c27 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -99,7 +99,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  * @wq: workqueue to use
  * @work: work to queue
  *
- * Returns non-zero if it was successfully added.
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
  *
  * We queue the work to the CPU it was submitted, but there is no
  * guarantee that it will be processed by that CPU.
@@ -138,7 +138,7 @@ static void delayed_work_timer_fn(unsigned long __data)
  * @work: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns non-zero if it was successfully added.
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
 			struct work_struct *work, unsigned long delay)
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
  * @work: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns non-zero if it was successfully added.
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work, unsigned long delay)
-- 
cgit v1.2.3


From d46a3d0d07ba539aea5b0e1ad30e568f0cb03576 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 29 Oct 2006 16:45:58 +0300
Subject: [PATCH] taskstats: fix sk_buff leak

'return genlmsg_cancel()' in taskstats_user_cmd/taskstats_exit_send
potentially leaks a skb.  Unless we pass 'rep_skb' to the netlink layer
we own sk_buff.  This means we should always do kfree_skb() on failure.

[ Thomas acked and pointed out missing return value in original version ]

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Thomas Graf <tgraf@suug.ch>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8adfb8069c6d..f3c3e9d43d2c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -411,7 +411,7 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
-	return genlmsg_cancel(rep_skb, reply);
+	rc = genlmsg_cancel(rep_skb, reply);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
@@ -507,7 +507,6 @@ send:
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
-	goto ret;
 err_skb:
 	nlmsg_free(rep_skb);
 ret:
-- 
cgit v1.2.3


From 3d8334def5cf831d2ed438aae021696a2faa4ddd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 29 Oct 2006 18:57:16 +0300
Subject: [PATCH] taskstats: fix sk_buff size calculation

prepare_reply() adds GENL_HDRLEN to the payload (genlmsg_total_size()),
but then it does genlmsg_put()->nlmsg_put().  This means we forget to
reserve a room for 'struct nlmsghdr'.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f3c3e9d43d2c..2039585ec5e1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -77,7 +77,8 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
+	size = nlmsg_total_size(genlmsg_total_size(size));
+	skb = nlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 0e2d57fc6e7dabdbfdd4f26c861e7e6c75d5bdcf Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 29 Oct 2006 22:46:34 -0800
Subject: [PATCH] ndiswrapper: don't set the module->taints flags

For ndiswrapper, don't set the module->taints flags, just set the kernel
global tainted flag.  This should allow ndiswrapper to continue to use GPL
symbols.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Florin Malita <fmalita@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5072a943fe35..f0166563c602 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1718,7 +1718,7 @@ static struct module *load_module(void __user *umod,
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
 	if (strcmp(mod->name, "ndiswrapper") == 0)
-		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+		add_taint(TAINT_PROPRIETARY_MODULE);
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 
-- 
cgit v1.2.3


From f0ec1aaf54caddd21c259aea8b2ecfbde4ee4fb9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 29 Oct 2006 22:46:43 -0800
Subject: [PATCH] xacct_add_tsk: fix pure theoretical ->mm use-after-free

Paranoid fix. The task can free its ->mm after the 'if (p->mm)' check.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/tsacct.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 65a5036a3d95..96f77013d3f0 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -80,13 +80,17 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
  */
 void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
+	struct mm_struct *mm;
+
 	/* convert pages-jiffies to Mbyte-usec */
 	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
 	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
-	if (p->mm) {
+	mm = get_task_mm(p);
+	if (mm) {
 		/* adjust to KB unit */
-		stats->hiwater_rss   = p->mm->hiwater_rss * PAGE_SIZE / KB;
-		stats->hiwater_vm    = p->mm->hiwater_vm * PAGE_SIZE / KB;
+		stats->hiwater_rss   = mm->hiwater_rss * PAGE_SIZE / KB;
+		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
+		mmput(mm);
 	}
 	stats->read_char	= p->rchar;
 	stats->write_char	= p->wchar;
-- 
cgit v1.2.3


From 4a279ff1ea1cf325775ada983035123fcdc8e986 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 30 Oct 2006 22:07:15 -0800
Subject: [PATCH] taskstats: fix sub-threads accounting

If there are no listeners, taskstats_exit_send() just returns because
taskstats_exit_alloc() didn't allocate *tidstats.  This is wrong, each
sub-thread should do fill_tgid_exit() on exit, otherwise its ->delays is
not recorded in ->signal->stats and lost.

Q: We don't send TASKSTATS_TYPE_AGGR_TGID when single-threaded process
exits.  Is it good?  How can the listener figure out that it was actually a
process exit, not sub-thread?

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2039585ec5e1..f45c5e70773c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -455,10 +455,9 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	int is_thread_group;
 	struct nlattr *na;
 
-	if (!family_registered || !tidstats)
+	if (!family_registered)
 		return;
 
-	rc = 0;
 	/*
 	 * Size includes space for nested attributes
 	 */
@@ -466,8 +465,15 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
 	is_thread_group = (tsk->signal->stats != NULL);
-	if (is_thread_group)
-		size = 2 * size;	/* PID + STATS + TGID + STATS */
+	if (is_thread_group) {
+		/* PID + STATS + TGID + STATS */
+		size = 2 * size;
+		/* fill the tsk->signal->stats structure */
+		fill_tgid_exit(tsk);
+	}
+
+	if (!tidstats)
+		return;
 
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
 	if (rc < 0)
@@ -487,11 +493,8 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 		goto send;
 
 	/*
-	 * tsk has/had a thread group so fill the tsk->signal->stats structure
 	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-
-	fill_tgid_exit(tsk);
 	if (!group_dead)
 		goto send;
 
-- 
cgit v1.2.3


From f46c483357c2d87606bbefb511321e3efd4baae0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 2 Nov 2006 22:07:16 -0800
Subject: [PATCH] Add printk_timed_ratelimit()

printk_ratelimit() has global state which makes it not useful for callers
which wish to perform ratelimiting at a particular frequency.

Add a printk_timed_ratelimit() which utilises caller-provided state storage to
permit more flexibility.

This function can in fact be used for things other than printk ratelimiting
and is perhaps poorly named.

Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h |  2 ++
 kernel/printk.c        | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 80f39cab470a..24b611147adb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -171,6 +171,8 @@ __attribute_const__ roundup_pow_of_two(unsigned long x)
 
 extern int printk_ratelimit(void);
 extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);
+extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+				unsigned int interval_msec);
 
 static inline void console_silent(void)
 {
diff --git a/kernel/printk.c b/kernel/printk.c
index f7d427ef5038..66426552fbfe 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
 #include <linux/security.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
+#include <linux/jiffies.h>
 
 #include <asm/uaccess.h>
 
@@ -1101,3 +1102,23 @@ int printk_ratelimit(void)
 				printk_ratelimit_burst);
 }
 EXPORT_SYMBOL(printk_ratelimit);
+
+/**
+ * printk_timed_ratelimit - caller-controlled printk ratelimiting
+ * @caller_jiffies: pointer to caller's state
+ * @interval_msecs: minimum interval between prints
+ *
+ * printk_timed_ratelimit() returns true if more than @interval_msecs
+ * milliseconds have elapsed since the last time printk_timed_ratelimit()
+ * returned true.
+ */
+bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+			unsigned int interval_msecs)
+{
+	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
+		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(printk_timed_ratelimit);
-- 
cgit v1.2.3


From 19c6b6ed3f597a583f58e3fc99256cc01ae8c394 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 2 Nov 2006 22:07:17 -0800
Subject: [PATCH] schedule removal of FUTEX_FD

Apparently FUTEX_FD is unfixably racy and nothing uses it (or if it does, it
shouldn't).

Add a warning printk, give any remaining users six months to migrate off it.

Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index b364e0026191..93ef30ba209f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1507,6 +1507,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	struct futex_q *q;
 	struct file *filp;
 	int ret, err;
+	static unsigned long printk_interval;
+
+	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
+		printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
+		    	"will be removed from the kernel in June 2007\n",
+			current->comm);
+	}
 
 	ret = -EINVAL;
 	if (!valid_signal(signal))
-- 
cgit v1.2.3


From b918f6e62cd46774f9fc0a3fbba6bd10ad85ee14 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 2 Nov 2006 22:07:19 -0800
Subject: [PATCH] swsusp: debugging

Add a swsusp debugging mode.  This does everything that's needed for a suspend
except for actually suspending.  So we can look in the log messages and work
out a) what code is being slow and b) which drivers are misbehaving.

(1)
# echo testproc > /sys/power/disk
# echo disk > /sys/power/state

This should turn off the non-boot CPU, freeze all processes, wait for 5
seconds and then thaw the processes and the CPU.

(2)
# echo test > /sys/power/disk
# echo disk > /sys/power/state

This should turn off the non-boot CPU, freeze all processes, shrink
memory, suspend all devices, wait for 5 seconds, resume the devices etc.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: Stefan Seyfried <seife@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/ABI/testing/sysfs-power | 17 +++++++++++++++-
 Documentation/power/interface.txt     | 13 ++++++++++++
 include/linux/pm.h                    |  4 +++-
 kernel/power/disk.c                   | 37 ++++++++++++++++++++++++++---------
 4 files changed, 60 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index d882f8093871..dcff4d0623ad 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -21,7 +21,7 @@ Description:
 		these states.
 
 What:		/sys/power/disk
-Date:		August 2006
+Date:		September 2006
 Contact:	Rafael J. Wysocki <rjw@sisk.pl>
 Description:
 		The /sys/power/disk file controls the operating mode of the
@@ -39,6 +39,19 @@ Description:
 		'reboot' - the memory image will be saved by the kernel and
 		the system will be rebooted.
 
+		Additionally, /sys/power/disk can be used to turn on one of the
+		two testing modes of the suspend-to-disk mechanism: 'testproc'
+		or 'test'.  If the suspend-to-disk mechanism is in the
+		'testproc' mode, writing 'disk' to /sys/power/state will cause
+		the kernel to disable nonboot CPUs and freeze tasks, wait for 5
+		seconds, unfreeze tasks and enable nonboot CPUs.  If it is in
+		the 'test' mode, writing 'disk' to /sys/power/state will cause
+		the kernel to disable nonboot CPUs and freeze tasks, shrink
+		memory, suspend devices, wait for 5 seconds, resume devices,
+		unfreeze tasks and enable nonboot CPUs.  Then, we are able to
+		look in the log messages and work out, for example, which code
+		is being slow and which device drivers are misbehaving.
+
 		The suspend-to-disk method may be chosen by writing to this
 		file one of the accepted strings:
 
@@ -46,6 +59,8 @@ Description:
 		'platform'
 		'shutdown'
 		'reboot'
+		'testproc'
+		'test'
 
 		It will only change to 'firmware' or 'platform' if the system
 		supports that.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index a66bec222b16..74311d7e0f3c 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -30,6 +30,17 @@ testing). The system will support either 'firmware' or 'platform', and
 that is known a priori. But, the user may choose 'shutdown' or
 'reboot' as alternatives. 
 
+Additionally, /sys/power/disk can be used to turn on one of the two testing
+modes of the suspend-to-disk mechanism: 'testproc' or 'test'.  If the
+suspend-to-disk mechanism is in the 'testproc' mode, writing 'disk' to
+/sys/power/state will cause the kernel to disable nonboot CPUs and freeze
+tasks, wait for 5 seconds, unfreeze tasks and enable nonboot CPUs.  If it is
+in the 'test' mode, writing 'disk' to /sys/power/state will cause the kernel
+to disable nonboot CPUs and freeze tasks, shrink memory, suspend devices, wait
+for 5 seconds, resume devices, unfreeze tasks and enable nonboot CPUs.  Then,
+we are able to look in the log messages and work out, for example, which code
+is being slow and which device drivers are misbehaving.
+
 Reading from this file will display what the mode is currently set
 to. Writing to this file will accept one of
 
@@ -37,6 +48,8 @@ to. Writing to this file will accept one of
        'platform'
        'shutdown'
        'reboot'
+       'testproc'
+       'test'
 
 It will only change to 'firmware' or 'platform' if the system supports
 it. 
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 6b27e07aef19..070394e846d0 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -116,7 +116,9 @@ typedef int __bitwise suspend_disk_method_t;
 #define	PM_DISK_PLATFORM	((__force suspend_disk_method_t) 2)
 #define	PM_DISK_SHUTDOWN	((__force suspend_disk_method_t) 3)
 #define	PM_DISK_REBOOT		((__force suspend_disk_method_t) 4)
-#define	PM_DISK_MAX		((__force suspend_disk_method_t) 5)
+#define	PM_DISK_TEST		((__force suspend_disk_method_t) 5)
+#define	PM_DISK_TESTPROC	((__force suspend_disk_method_t) 6)
+#define	PM_DISK_MAX		((__force suspend_disk_method_t) 7)
 
 struct pm_ops {
 	suspend_disk_method_t pm_disk_mode;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d3a158a60312..b1fb7866b0b3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,7 +71,7 @@ static inline void platform_finish(void)
 
 static int prepare_processes(void)
 {
-	int error;
+	int error = 0;
 
 	pm_prepare_console();
 
@@ -84,6 +84,12 @@ static int prepare_processes(void)
 		goto thaw;
 	}
 
+	if (pm_disk_mode == PM_DISK_TESTPROC) {
+		printk("swsusp debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		goto thaw;
+	}
+
 	/* Free memory before shutting down devices. */
 	if (!(error = swsusp_shrink_memory()))
 		return 0;
@@ -120,13 +126,21 @@ int pm_suspend_disk(void)
 	if (error)
 		return error;
 
+	if (pm_disk_mode == PM_DISK_TESTPROC)
+		goto Thaw;
+
 	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
 	if (error) {
 		resume_console();
 		printk("Some devices failed to suspend\n");
-		unprepare_processes();
-		return error;
+		goto Thaw;
+	}
+
+	if (pm_disk_mode == PM_DISK_TEST) {
+		printk("swsusp debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		goto Done;
 	}
 
 	pr_debug("PM: snapshotting memory.\n");
@@ -143,16 +157,17 @@ int pm_suspend_disk(void)
 			power_down(pm_disk_mode);
 		else {
 			swsusp_free();
-			unprepare_processes();
-			return error;
+			goto Thaw;
 		}
-	} else
+	} else {
 		pr_debug("PM: Image restored successfully.\n");
+	}
 
 	swsusp_free();
  Done:
 	device_resume();
 	resume_console();
+ Thaw:
 	unprepare_processes();
 	return error;
 }
@@ -249,6 +264,8 @@ static const char * const pm_disk_modes[] = {
 	[PM_DISK_PLATFORM]	= "platform",
 	[PM_DISK_SHUTDOWN]	= "shutdown",
 	[PM_DISK_REBOOT]	= "reboot",
+	[PM_DISK_TEST]		= "test",
+	[PM_DISK_TESTPROC]	= "testproc",
 };
 
 /**
@@ -303,17 +320,19 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 		}
 	}
 	if (mode) {
-		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
+		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
+		     mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
 			pm_disk_mode = mode;
-		else {
+		} else {
 			if (pm_ops && pm_ops->enter &&
 			    (mode == pm_ops->pm_disk_mode))
 				pm_disk_mode = mode;
 			else
 				error = -EINVAL;
 		}
-	} else
+	} else {
 		error = -EINVAL;
+	}
 
 	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
 		 pm_disk_modes[mode]);
-- 
cgit v1.2.3


From 3fd593979802f81ff6452596ac61e3840f917589 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 2 Nov 2006 22:07:24 -0800
Subject: [PATCH] Create compat_sys_migrate_pages

This is needed on bigendian 64bit architectures.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compat.h |  4 ++++
 kernel/compat.c        | 33 +++++++++++++++++++++++++++++++++
 kernel/sys_ni.c        |  1 +
 3 files changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index f1553196826f..80b17f440ec1 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -230,5 +230,9 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 extern int compat_printk(const char *fmt, ...);
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
+asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+		compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
+		const compat_ulong_t __user *new_nodes);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index d4898aad6cfa..6952dd057300 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
 	}
 	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
+
+asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+			compat_ulong_t maxnode,
+			const compat_ulong_t __user *old_nodes,
+			const compat_ulong_t __user *new_nodes)
+{
+	unsigned long __user *old = NULL;
+	unsigned long __user *new = NULL;
+	nodemask_t tmp_mask;
+	unsigned long nr_bits;
+	unsigned long size;
+
+	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+	if (old_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+			return -EFAULT;
+		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+		if (new_nodes)
+			new = old + size / sizeof(unsigned long);
+		if (copy_to_user(old, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	if (new_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+			return -EFAULT;
+		if (new == NULL)
+			new = compat_alloc_user_space(size);
+		if (copy_to_user(new, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	return sys_migrate_pages(pid, nr_bits + 1, old, new);
+}
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0e53314b14de..d7306d0f3dfc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,7 @@ cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
 cond_syscall(compat_sys_move_pages);
+cond_syscall(compat_sys_migrate_pages);
 
 /* block-layer dependent */
 cond_syscall(sys_bdflush);
-- 
cgit v1.2.3


From 45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 4 Nov 2006 10:06:02 -0800
Subject: Fix unlikely (but possible) race condition on task->user access

There's a possible race condition when doing a "switch_uid()" from one
user to another, which could race with another thread doing a signal
allocation and looking at the old thread ->user pointer as it is freed.

This explains an oops reported by Lukasz Trabinski:
	http://permalink.gmane.org/gmane.linux.kernel/462241

We fix this by delaying the (reference-counted) freeing of the user
structure until the thread signal handler lock has been released, so
that we know that the signal allocation has either seen the new value or
has properly incremented the reference count of the old one.

Race identified by Oleg Nesterov.

Cc: Lukasz Trabinski <lukasz@wsisiz.edu.pl>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/user.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 6408c0424291..220e586127a0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user)
 	atomic_dec(&old_user->processes);
 	switch_uid_keyring(new_user);
 	current->user = new_user;
+
+	/*
+	 * We need to synchronize with __sigqueue_alloc()
+	 * doing a get_uid(p->user).. If that saw the old
+	 * user value, we need to wait until it has exited
+	 * its critical region before we can free the old
+	 * structure.
+	 */
+	smp_mb();
+	spin_unlock_wait(&current->sighand->siglock);
+
 	free_uid(old_user);
 	suid_keys(current);
 }
-- 
cgit v1.2.3


From 10b1fbdb0a0ca91847a534ad26d0bc250c25b74f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 4 Nov 2006 13:03:00 -0800
Subject: Make sure "user->sigpending" count is in sync

The previous commit (45c18b0bb579b5c1b89f8c99f1b6ffa4c586ba08, aka "Fix
unlikely (but possible) race condition on task->user access") fixed a
potential oops due to __sigqueue_alloc() getting its "user" pointer out
of sync with switch_user(), and accessing a user pointer that had been
de-allocated on another CPU.

It still left another (much less serious) problem, where a concurrent
__sigqueue_alloc and swich_user could cause sigqueue_alloc to do signal
pending reference counting for a _different_ user than the one it then
actually ended up using.  No oops, but we'd end up with the wrong signal
accounting.

Another case of Oleg's eagle-eyes picking up the problem.

This is trivially fixed by just making sure we load whichever "user"
structure we decide to use (it doesn't matter _which_ one we pick, we
just need to pick one) just once.

Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 7ed8d5304bec..df18c167a2a7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -267,18 +267,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
 	struct sigqueue *q = NULL;
+	struct user_struct *user;
 
-	atomic_inc(&t->user->sigpending);
+	/*
+	 * In order to avoid problems with "switch_user()", we want to make
+	 * sure that the compiler doesn't re-load "t->user"
+	 */
+	user = t->user;
+	barrier();
+	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
-	    atomic_read(&t->user->sigpending) <=
+	    atomic_read(&user->sigpending) <=
 			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	if (unlikely(q == NULL)) {
-		atomic_dec(&t->user->sigpending);
+		atomic_dec(&user->sigpending);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->user = get_uid(t->user);
+		q->user = get_uid(user);
 	}
 	return(q);
 }
-- 
cgit v1.2.3


From 4b96b1a10cb00c867103b21f0f2a6c91b705db11 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Sun, 5 Nov 2006 23:52:04 -0800
Subject: [PATCH] Fix the spurious unlock_cpu_hotplug false warnings

Cpu-hotplug locking has a minor race case caused because of setting the
variable "recursive" to NULL *after* releasing the cpu_bitmask_lock in the
function unlock_cpu_hotplug,instead of doing so before releasing the
cpu_bitmask_lock.

This was the cause of most of the recent false spurious lock_cpu_unlock
warnings.

This should fix the problem reported by Martin Lorenz reported in
http://lkml.org/lkml/2006/10/29/127.

Thanks to Srinivasa DS for pointing it out.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 663c920b2234..272254f20d97 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void)
 		recursive_depth--;
 		return;
 	}
-	mutex_unlock(&cpu_bitmask_lock);
 	recursive = NULL;
+	mutex_unlock(&cpu_bitmask_lock);
 }
 EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
 
-- 
cgit v1.2.3


From 64efade11cddc4237c1b95ea4ca18af122a7e19e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 5 Nov 2006 23:52:10 -0800
Subject: [PATCH] lockdep: fix delayacct locking bug

Make the delayacct lock irqsave; this avoids the possible deadlock where
an interrupt is taken while holding the delayacct lock which needs to
take the delayacct lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 36752f124c6a..66a0ea48751d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
 {
 	struct timespec ts;
 	s64 ns;
+	unsigned long flags;
 
 	do_posix_clock_monotonic_gettime(end);
 	ts = timespec_sub(*end, *start);
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
 	if (ns < 0)
 		return;
 
-	spin_lock(&current->delays->lock);
+	spin_lock_irqsave(&current->delays->lock, flags);
 	*total += ns;
 	(*count)++;
-	spin_unlock(&current->delays->lock);
+	spin_unlock_irqrestore(&current->delays->lock, flags);
 }
 
 void __delayacct_blkio_start(void)
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	s64 tmp;
 	struct timespec ts;
 	unsigned long t1,t2,t3;
+	unsigned long flags;
 
 	/* Though tsk->delays accessed later, early exit avoids
 	 * unnecessary returning of other data
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
 
-	spin_lock(&tsk->delays->lock);
+	spin_lock_irqsave(&tsk->delays->lock, flags);
 	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
 	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
 	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
 	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
-	spin_unlock(&tsk->delays->lock);
+	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 done:
 	return 0;
@@ -152,11 +154,12 @@ done:
 __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	__u64 ret;
+	unsigned long flags;
 
-	spin_lock(&tsk->delays->lock);
+	spin_lock_irqsave(&tsk->delays->lock, flags);
 	ret = nsec_to_clock_t(tsk->delays->blkio_delay +
 				tsk->delays->swapin_delay);
-	spin_unlock(&tsk->delays->lock);
+	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 0e009be8a0c2309f3696df70f72ef0075aa34c9c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 5 Nov 2006 23:52:11 -0800
Subject: [PATCH] Improve the removed sysctl warnings

Don't warn about libpthread's access to kernel.version.  When it receives
-ENOSYS it will read /proc/sys/kernel/version.

If anything else shows up print the sysctl number string.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Cal Peake <cp@absolutedigital.net>
Cc: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bff2c18fb5a..0c8e805bbd6f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2680,13 +2680,33 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
 {
 	static int msg_count;
+	struct __sysctl_args tmp;
+	int name[CTL_MAXNAME];
+	int i;
+
+	/* Read in the sysctl name for better debug message logging */
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+	if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
+		return -ENOTDIR;
+	for (i = 0; i < tmp.nlen; i++)
+		if (get_user(name[i], tmp.name + i))
+			return -EFAULT;
+
+	/* Ignore accesses to kernel.version */
+	if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
+		goto out;
 
 	if (msg_count < 5) {
 		msg_count++;
 		printk(KERN_INFO
 			"warning: process `%s' used the removed sysctl "
-			"system call\n", current->comm);
+			"system call with ", current->comm);
+		for (i = 0; i < tmp.nlen; i++)
+			printk("%d.", name[i]);
+		printk("\n");
 	}
+out:
 	return -ENOSYS;
 }
 
-- 
cgit v1.2.3


From d99f160ac53e51090f015a8f0617cea25f81a191 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 5 Nov 2006 23:52:12 -0800
Subject: [PATCH] sysctl: allow a zero ctl_name in the middle of a sysctl table

Since it is becoming clear that there are just enough users of the binary
sysctl interface that completely removing the binary interface from the kernel
will not be an option for foreseeable future, we need to find a way to address
the sysctl maintenance issues.

The basic problem is that sysctl requires one central authority to allocate
sysctl numbers, or else conflicts and ABI breakage occur.  The proc interface
to sysctl does not have that problem, as names are not densely allocated.

By not terminating a sysctl table until I have neither a ctl_name nor a
procname, it becomes simple to add sysctl entries that don't show up in the
binary sysctl interface.  Which allows people to avoid allocating a binary
sysctl value when not needed.

I have audited the kernel code and in my reading I have not found a single
sysctl table that wasn't terminated by a completely zero filled entry.  So
this change in behavior should not affect anything.

I think this mechanism eases the pain enough that combined with a little
disciple we can solve the reoccurring sysctl ABI breakage.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sysctl.h | 9 ++++++---
 kernel/sysctl.c        | 8 +++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1b24bd45e080..c184732a70fc 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -961,8 +961,8 @@ extern ctl_handler sysctl_ms_jiffies;
 /*
  * Register a set of sysctl names by calling register_sysctl_table
  * with an initialised array of ctl_table's.  An entry with zero
- * ctl_name terminates the table.  table->de will be set up by the
- * registration and need not be initialised in advance.
+ * ctl_name and NULL procname terminates the table.  table->de will be
+ * set up by the registration and need not be initialised in advance.
  *
  * sysctl names can be mirrored automatically under /proc/sys.  The
  * procname supplied controls /proc naming.
@@ -973,7 +973,10 @@ extern ctl_handler sysctl_ms_jiffies;
  * Leaf nodes in the sysctl tree will be represented by a single file
  * under /proc; non-leaf nodes will be represented by directories.  A
  * null procname disables /proc mirroring at this node.
- * 
+ *
+ * sysctl entries with a zero ctl_name will not be available through
+ * the binary sysctl interface.
+ *
  * sysctl(2) can automatically manage read and write requests through
  * the sysctl table.  The data and maxlen fields of the ctl_table
  * struct enable minimal validation of the values being written to be
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0c8e805bbd6f..09e569f4792b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1315,7 +1315,9 @@ repeat:
 		return -ENOTDIR;
 	if (get_user(n, name))
 		return -EFAULT;
-	for ( ; table->ctl_name; table++) {
+	for ( ; table->ctl_name || table->procname; table++) {
+		if (!table->ctl_name)
+			continue;
 		if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
 			int error;
 			if (table->child) {
@@ -1532,7 +1534,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
 	int len;
 	mode_t mode;
 	
-	for (; table->ctl_name; table++) {
+	for (; table->ctl_name || table->procname; table++) {
 		/* Can't do anything without a proc name. */
 		if (!table->procname)
 			continue;
@@ -1579,7 +1581,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
 static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
 {
 	struct proc_dir_entry *de;
-	for (; table->ctl_name; table++) {
+	for (; table->ctl_name || table->procname; table++) {
 		if (!(de = table->de))
 			continue;
 		if (de->mode & S_IFDIR) {
-- 
cgit v1.2.3


From 0130b0b32ee53dc7add773fcea984f6a26ef1da3 Mon Sep 17 00:00:00 2001
From: Sharyathi Nagesh <sharyath@in.ibm.com>
Date: Fri, 10 Nov 2006 12:27:54 -0800
Subject: [PATCH] fix Data Acess error in dup_fd

On running the Stress Test on machine for more than 72 hours following
error message was observed.

0:mon> e
cpu 0x0: Vector: 300 (Data Access) at [c00000007ce2f7f0]
    pc: c000000000060d90: .dup_fd+0x240/0x39c
    lr: c000000000060d6c: .dup_fd+0x21c/0x39c
    sp: c00000007ce2fa70
   msr: 800000000000b032
   dar: ffffffff00000028
 dsisr: 40000000
  current = 0xc000000074950980
  paca    = 0xc000000000454500
    pid   = 27330, comm = bash

0:mon> t
[c00000007ce2fa70] c000000000060d28 .dup_fd+0x1d8/0x39c (unreliable)
[c00000007ce2fb30] c000000000060f48 .copy_files+0x5c/0x88
[c00000007ce2fbd0] c000000000061f5c .copy_process+0x574/0x1520
[c00000007ce2fcd0] c000000000062f88 .do_fork+0x80/0x1c4
[c00000007ce2fdc0] c000000000011790 .sys_clone+0x5c/0x74
[c00000007ce2fe30] c000000000008950 .ppc_clone+0x8/0xc

The problem is because of race window.  When if(expand) block is executed in
dup_fd unlocking of oldf->file_lock give a window for fdtable in oldf to be
modified.  So actual open_files in oldf may not match with open_files
variable.

Cc: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3da978eec791..4b4eab2a3161 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -687,6 +687,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		 * the latest pointer.
 		 */
 		spin_lock(&oldf->file_lock);
+		open_files = count_open_files(old_fdt);
 		old_fdt = files_fdtable(oldf);
 	}
 
-- 
cgit v1.2.3


From f72fa707604c015a6625e80f269506032d5430dc Mon Sep 17 00:00:00 2001
From: Pavel Emelianov <xemul@openvz.org>
Date: Fri, 10 Nov 2006 12:27:56 -0800
Subject: [PATCH] Fix misrouted interrupts deadlocks

While testing kernel on machine with "irqpoll" option I've caught such a
lockup:

	__do_IRQ()
	   spin_lock(&desc->lock);
           desc->chip->ack(); /* IRQ is ACKed */
	note_interrupt()
	misrouted_irq()
	handle_IRQ_event()
           if (...)
	      local_irq_enable_in_hardirq();
	/* interrupts are enabled from now */
	...
	__do_IRQ() /* same IRQ we've started from */
	   spin_lock(&desc->lock); /* LOCKUP */

Looking at misrouted_irq() code I've found that a potential deadlock like
this can also take place:

1CPU:
__do_IRQ()
   spin_lock(&desc->lock); /* irq = A */
misrouted_irq()
   for (i = 1; i < NR_IRQS; i++) {
      spin_lock(&desc->lock); /* irq = B */
      if (desc->status & IRQ_INPROGRESS) {

2CPU:
__do_IRQ()
   spin_lock(&desc->lock); /* irq = B */
misrouted_irq()
   for (i = 1; i < NR_IRQS; i++) {
      spin_lock(&desc->lock); /* irq = A */
      if (desc->status & IRQ_INPROGRESS) {

As the second lock on both CPUs is taken before checking that this irq is
being handled in another processor this may cause a deadlock.  This issue
is only theoretical.

I propose the attached patch to fix booth problems: when trying to handle
misrouted IRQ active desc->lock may be unlocked.

Acked-by: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/spurious.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 543ea2e5ad93..9c7e2e4c1fe7 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -147,7 +147,11 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (unlikely(irqfixup)) {
 		/* Don't punish working computers */
 		if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
-			int ok = misrouted_irq(irq);
+			int ok;
+
+			spin_unlock(&desc->lock);
+			ok = misrouted_irq(irq);
+			spin_lock(&desc->lock);
 			if (action_ret == IRQ_NONE)
 				desc->irqs_unhandled -= ok;
 		}
-- 
cgit v1.2.3


From 8b126b77536186eef69d408eb7959ce7f558f251 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 14 Nov 2006 02:03:23 -0800
Subject: [PATCH] setup_irq(): better mismatch debugging

When we get a mismatch between handlers on the same IRQ, all we get is "IRQ
handler type mismatch for IRQ n".  Let's print the name of the
presently-registered handler with which we got the mismatch.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6879202afe9a..b385878c6e80 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	struct irqaction *old, **p;
+	const char *old_name = NULL;
 	unsigned long flags;
 	int shared = 0;
 
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		 * set the trigger type must match.
 		 */
 		if (!((old->flags & new->flags) & IRQF_SHARED) ||
-		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
+		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+			old_name = old->name;
 			goto mismatch;
+		}
 
 #if defined(CONFIG_IRQ_PER_CPU)
 		/* All handlers must agree on per-cpuness */
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	return 0;
 
 mismatch:
-	spin_unlock_irqrestore(&desc->lock, flags);
 	if (!(new->flags & IRQF_PROBE_SHARED)) {
 		printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
+		if (old_name)
+			printk(KERN_ERR "current handler: %s\n", old_name);
 		dump_stack();
 	}
+	spin_unlock_irqrestore(&desc->lock, flags);
 	return -EBUSY;
 }
 
-- 
cgit v1.2.3


From 9a3a04ac386f44175b6a4142eaeab3d4170a57f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.osdl.org>
Date: Tue, 14 Nov 2006 15:20:51 -0800
Subject: Revert "[PATCH] fix Data Acess error in dup_fd"

This reverts commit 0130b0b32ee53dc7add773fcea984f6a26ef1da3.

Sergey Vlasov points out (and Vadim Lobanov concurs) that the bug it was
supposed to fix must be some unrelated memory corruption, and the "fix"
actually causes more problems:

  "However, the new code does not look safe in all cases.  If some other
   task has opened more files while dup_fd() released oldf->file_lock, the
   new code will update open_files to the new larger value.  But newf was
   allocated with the old smaller value of open_files, therefore subsequent
   accesses to newf may try to write into unallocated memory."

so revert it.

Cc: Sharyathi Nagesh <sharyath@in.ibm.com>
Cc: Sergey Vlasov <vsu@altlinux.ru>
Cc: Vadim Lobanov <vlobanov@speakeasy.net>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4b4eab2a3161..3da978eec791 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -687,7 +687,6 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		 * the latest pointer.
 		 */
 		spin_lock(&oldf->file_lock);
-		open_files = count_open_files(old_fdt);
 		old_fdt = files_fdtable(oldf);
 	}
 
-- 
cgit v1.2.3


From b86432b42eba5671969a9e6483ee219674b7ee25 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 16 Nov 2006 01:19:10 -0800
Subject: [PATCH] some irq_chip variables point to NULL

I got an oops when booting 2.6.19-rc5-mm1 on my ia64 machine.

Below is the log.

Oops 11012296146944 [1]
Modules linked in: binfmt_misc dm_mirror dm_multipath dm_mod thermal processor f
an container button sg eepro100 e100 mii

Pid: 0, CPU 0, comm:              swapper
psr : 0000121008022038 ifs : 800000000000040b ip  : [<a0000001000e1411>]    Not
tainted
ip is at __do_IRQ+0x371/0x3e0
unat: 0000000000000000 pfs : 000000000000040b rsc : 0000000000000003
rnat: 656960155aa56aa5 bsps: a00000010058b890 pr  : 656960155aa55a65
ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f
csd : 0000000000000000 ssd : 0000000000000000
b0  : a0000001000e1390 b6  : a0000001005beac0 b7  : e00000007f01aa00
f6  : 000000000000000000000 f7  : 0ffe69090000000000000
f8  : 1000a9090000000000000 f9  : 0ffff8000000000000000
f10 : 1000a908ffffff6f70000 f11 : 1003e0000000000000909
r1  : a000000100fbbff0 r2  : 0000000000010002 r3  : 0000000000010001
r8  : fffffffffffbffff r9  : a000000100bd8060 r10 : a000000100dd83b8
r11 : fffffffffffeffff r12 : a000000100bcbbb0 r13 : a000000100bc4000
r14 : 0000000000010000 r15 : 0000000000010000 r16 : a000000100c01aa8
r17 : a000000100d2c350 r18 : 0000000000000000 r19 : a000000100d2c300
r20 : a000000100c01a88 r21 : 0000000080010100 r22 : a000000100c01ac0
r23 : a0000001000108e0 r24 : e000000477980004 r25 : 0000000000000000
r26 : 0000000000000000 r27 : e00000000913400c r28 : e0000004799ee51c
r29 : e0000004778b87f0 r30 : a000000100d2c300 r31 : a00000010005c7e0

Call Trace:
 [<a000000100014600>] show_stack+0x40/0xa0
                                sp=a000000100bcb760 bsp=a000000100bc4f40
 [<a000000100014f00>] show_regs+0x840/0x880
                                sp=a000000100bcb930 bsp=a000000100bc4ee8
 [<a000000100037fb0>] die+0x250/0x320
                                sp=a000000100bcb930 bsp=a000000100bc4ea0
 [<a00000010005e5f0>] ia64_do_page_fault+0x8d0/0xa20
                                sp=a000000100bcb950 bsp=a000000100bc4e50
 [<a00000010000caa0>] ia64_leave_kernel+0x0/0x290
                                sp=a000000100bcb9e0 bsp=a000000100bc4e50
 [<a0000001000e1410>] __do_IRQ+0x370/0x3e0
                                sp=a000000100bcbbb0 bsp=a000000100bc4df0
 [<a000000100011f50>] ia64_handle_irq+0x170/0x220
                                sp=a000000100bcbbb0 bsp=a000000100bc4dc0
 [<a00000010000caa0>] ia64_leave_kernel+0x0/0x290
                                sp=a000000100bcbbb0 bsp=a000000100bc4dc0
 [<a000000100012390>] ia64_pal_call_static+0x90/0xc0
                                sp=a000000100bcbd80 bsp=a000000100bc4d78
 [<a000000100015630>] default_idle+0x90/0x160
                                sp=a000000100bcbd80 bsp=a000000100bc4d58
 [<a000000100014290>] cpu_idle+0x1f0/0x440
                                sp=a000000100bcbe20 bsp=a000000100bc4d18
 [<a000000100009980>] rest_init+0xc0/0xe0
                                sp=a000000100bcbe20 bsp=a000000100bc4d00
 [<a0000001009f8ea0>] start_kernel+0x6a0/0x6c0
                                sp=a000000100bcbe20 bsp=a000000100bc4ca0
 [<a0000001000089f0>] __end_ivt_text+0x6d0/0x6f0
                                sp=a000000100bcbe30 bsp=a000000100bc4c00
 <0>Kernel panic - not syncing: Aiee, killing interrupt handler!

The root cause is that some irq_chip variables, especially ia64_msi_chip,
initiate their memeber end to point to NULL. __do_IRQ doesn't check
if irq_chip->end is null and just calls it after processing the interrupt.

As irq_chip->end is called at many places, so I fix it by reinitiating
irq_chip->end to dummy_irq_chip.end, e.g., a noop function.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2d0dc3efe813..ebfd24a41858 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
 		chip->shutdown = chip->disable;
 	if (!chip->name)
 		chip->name = chip->typename;
+	if (!chip->end)
+		chip->end = dummy_irq_chip.end;
 }
 
 static inline void mask_ack_irq(struct irq_desc *desc, int irq)
-- 
cgit v1.2.3


From 1ff5683043196b9ad628a5de6bf8eeca52ee8bfd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Nov 2006 19:57:22 +0100
Subject: [PATCH] lockdep: fix static keys in module-allocated percpu areas

lockdep got confused by certain locks in modules:

 INFO: trying to register non-static key.
 the code is fine but needs lockdep annotation.
 turning off the locking correctness validator.

 Call Trace:
  [<ffffffff8026f40d>] dump_trace+0xaa/0x3f2
  [<ffffffff8026f78f>] show_trace+0x3a/0x60
  [<ffffffff8026f9d1>] dump_stack+0x15/0x17
  [<ffffffff802abfe8>] __lock_acquire+0x724/0x9bb
  [<ffffffff802ac52b>] lock_acquire+0x4d/0x67
  [<ffffffff80267139>] rt_spin_lock+0x3d/0x41
  [<ffffffff8839ed3f>] :ip_conntrack:__ip_ct_refresh_acct+0x131/0x174
  [<ffffffff883a1334>] :ip_conntrack:udp_packet+0xbf/0xcf
  [<ffffffff8839f9af>] :ip_conntrack:ip_conntrack_in+0x394/0x4a7
  [<ffffffff8023551f>] nf_iterate+0x41/0x7f
  [<ffffffff8025946a>] nf_hook_slow+0x64/0xd5
  [<ffffffff802369a2>] ip_rcv+0x24e/0x506
  [...]

Steven Rostedt found the bug: static_obj() check did not take
PERCPU_ENOUGH_ROOM into account, so in-module DEFINE_PER_CPU-area locks
were triggering this message.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b739be2a6dc9..c9fefdb1a7db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1081,7 +1081,8 @@ static int static_obj(void *obj)
 	 */
 	for_each_possible_cpu(i) {
 		start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
-		end   = (unsigned long) &__per_cpu_end   + per_cpu_offset(i);
+		end   = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
+					+ per_cpu_offset(i);
 
 		if ((addr >= start) && (addr < end))
 			return 1;
-- 
cgit v1.2.3