Diffstat (limited to 'kernel')
49 files changed, 782 insertions, 359 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 642d4277c2ea..2a999836ca18 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,11 +4,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o \ + sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \ + utsname.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o -obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/audit.c b/kernel/audit.c index d13276d41410..eb0f9165b401 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -58,6 +58,7 @@ #include <linux/selinux.h> #include <linux/inotify.h> #include <linux/freezer.h> +#include <linux/tty.h> #include "audit.h" @@ -391,6 +392,7 @@ static int kauditd_thread(void *dummy) { struct sk_buff *skb; + set_freezable(); while (!kthread_should_stop()) { skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); @@ -423,6 +425,31 @@ static int kauditd_thread(void *dummy) return 0; } +static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) +{ + struct task_struct *tsk; + int err; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + err = -ESRCH; + if (!tsk) + goto out; + err = 0; + + spin_lock_irq(&tsk->sighand->siglock); + if (!tsk->signal->audit_tty) + err = -EPERM; + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + goto out; + + tty_audit_push_task(tsk, loginuid); +out: + read_unlock(&tasklist_lock); + return err; +} + int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; @@ -511,6 +538,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_DEL: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: + case AUDIT_TTY_GET: + case AUDIT_TTY_SET: if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; @@ -622,6 +651,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter_user(&NETLINK_CB(skb), msg_type); if (err == 1) { err = 0; + if (msg_type == AUDIT_USER_TTY) { + err = audit_prepare_user_tty(pid, loginuid); + if (err) + break; + } ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (ab) { audit_log_format(ab, @@ -638,8 +672,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) " subj=%s", ctx); kfree(ctx); } - audit_log_format(ab, " msg='%.1024s'", - (char *)data); + if (msg_type != AUDIT_USER_TTY) + audit_log_format(ab, " msg='%.1024s'", + (char *)data); + else { + int size; + + audit_log_format(ab, " msg="); + size = nlmsg_len(nlh); + audit_log_n_untrustedstring(ab, size, + data); + } audit_set_pid(ab, pid); audit_log_end(ab); } @@ -730,6 +773,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; + case AUDIT_TTY_GET: { + struct audit_tty_status s; + struct task_struct *tsk; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) + err = -ESRCH; + else { + 
spin_lock_irq(&tsk->sighand->siglock); + s.enabled = tsk->signal->audit_tty != 0; + spin_unlock_irq(&tsk->sighand->siglock); + } + read_unlock(&tasklist_lock); + audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, + &s, sizeof(s)); + break; + } + case AUDIT_TTY_SET: { + struct audit_tty_status *s; + struct task_struct *tsk; + + if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) + return -EINVAL; + s = data; + if (s->enabled != 0 && s->enabled != 1) + return -EINVAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) + err = -ESRCH; + else { + spin_lock_irq(&tsk->sighand->siglock); + tsk->signal->audit_tty = s->enabled != 0; + spin_unlock_irq(&tsk->sighand->siglock); + } + read_unlock(&tasklist_lock); + break; + } default: err = -EINVAL; break; @@ -1185,7 +1267,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, } /** - * audit_log_n_unstrustedstring - log a string that may contain random characters + * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @len: lenth of string (not including trailing null) * @string: string to be logged @@ -1201,25 +1283,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, const char *string) { - const unsigned char *p = string; + const unsigned char *p; - while (*p) { + for (p = string; p < (const unsigned char *)string + len && *p; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { audit_log_hex(ab, string, len); return string + len + 1; } - p++; } audit_log_n_string(ab, len, string); return p + 1; } /** - * audit_log_unstrustedstring - log a string that may contain random characters + * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * - * Same as audit_log_n_unstrustedstring(), except that strlen is used to + * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) diff --git a/kernel/audit.h b/kernel/audit.h index 815d6f5c04ee..95877435c347 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -115,7 +115,6 @@ extern struct sk_buff * audit_make_reply(int pid, int seq, int type, extern void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); -extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); struct audit_netlink_list { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e36481ed61b4..b7640a5f382a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -71,9 +71,6 @@ extern struct list_head audit_filter_list[]; -/* No syscall auditing will take place unless audit_enabled != 0. */ -extern int audit_enabled; - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). */ #define AUDIT_NAMES 20 @@ -2040,7 +2037,7 @@ int __audit_signal_info(int sig, struct task_struct *t) /** * audit_core_dumps - record information about processes that end abnormally - * @sig: signal value + * @signr: signal value * * If a process ends with a core dump, something fishy is going on and we * should record the event for investigation. 
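The audit.c hunk above rewrites audit_log_n_untrustedstring() so the scan is bounded by the caller-supplied len instead of running until a NUL terminator, which matters now that AUDIT_USER_TTY payloads are logged with nlmsg_len(nlh) as the length and need not be terminated. A minimal standalone sketch of that bounded check (log_as_string() and the test driver are illustrative names, not the kernel API; in audit.c the two branches call audit_log_hex() and audit_log_n_string()):

/* Bounded "untrusted string" scan, as introduced above: inspect at most
 * len bytes and stop early at a NUL. Standalone sketch -- log_as_string()
 * is an illustrative stand-in for the kernel helper. */
#include <stddef.h>
#include <stdio.h>

static int log_as_string(const char *string, size_t len)
{
	const unsigned char *p;

	for (p = (const unsigned char *)string;
	     p < (const unsigned char *)string + len && *p; p++) {
		/* Quotes and non-printable bytes force hex encoding. */
		if (*p == '"' || *p < 0x21 || *p > 0x7f)
			return 0;	/* kernel: audit_log_hex() */
	}
	return 1;			/* kernel: audit_log_n_string() */
}

int main(void)
{
	char unterminated[3] = { 'a', 'b', 'c' };	/* no trailing NUL */

	/* The old while (*p) loop could read past this buffer; the
	 * bounded loop stops after len bytes. */
	printf("%d\n", log_as_string(unterminated, sizeof(unterminated)));
	return 0;
}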
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4c49188cc49b..824b1c01f410 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -981,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); if (!mmarray) goto done; - write_lock_irq(&tasklist_lock); /* block fork */ + read_lock(&tasklist_lock); /* block fork */ if (atomic_read(&cs->count) <= ntasks) break; /* got enough */ - write_unlock_irq(&tasklist_lock); /* try again */ + read_unlock(&tasklist_lock); /* try again */ kfree(mmarray); } @@ -1006,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) continue; mmarray[n++] = mm; } while_each_thread(g, p); - write_unlock_irq(&tasklist_lock); + read_unlock(&tasklist_lock); /* * Now that we've dropped the tasklist spinlock, we can diff --git a/kernel/exit.c b/kernel/exit.c index ca6a11b73023..e8af8d0c2483 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -31,6 +31,7 @@ #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> +#include <linux/freezer.h> #include <linux/cpuset.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -387,6 +388,11 @@ void daemonize(const char *name, ...) * they would be locked into memory. */ exit_mm(current); + /* + * We don't want to have TIF_FREEZE set if the system-wide hibernation + * or suspend transition begins right now. + */ + current->flags |= PF_NOFREEZE; set_special_pids(1, 1); proc_clear_tty(current); @@ -858,6 +864,34 @@ static void exit_notify(struct task_struct *tsk) release_task(tsk); } +#ifdef CONFIG_DEBUG_STACK_USAGE +static void check_stack_usage(void) +{ + static DEFINE_SPINLOCK(low_water_lock); + static int lowest_to_date = THREAD_SIZE; + unsigned long *n = end_of_stack(current); + unsigned long free; + + while (*n == 0) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(current); + + if (free >= lowest_to_date) + return; + + spin_lock(&low_water_lock); + if (free < lowest_to_date) { + printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " + "left\n", + current->comm, free); + lowest_to_date = free; + } + spin_unlock(&low_water_lock); +} +#else +static inline void check_stack_usage(void) {} +#endif + fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -937,6 +971,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif + if (group_dead) + tty_audit_exit(); if (unlikely(tsk->audit_context)) audit_free(tsk); @@ -949,6 +985,7 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); + check_stack_usage(); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index da3a155bba0d..ba39bdb2a7b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/random.h> +#include <linux/tty.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -897,6 +898,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts } acct_init_pacct(&sig->pacct); + tty_audit_fork(sig); + return 0; } @@ -920,7 +923,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); + new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; if (!(clone_flags & CLONE_PTRACE)) p->ptrace = 0; @@ -999,7 +1002,7 @@ static struct task_struct 
*copy_process(unsigned long clone_flags, if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != &root_user) + p->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } @@ -1059,6 +1062,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); + p->real_start_time = p->start_time; + monotonic_to_bootbased(&p->real_start_time); p->security = NULL; p->io_context = NULL; p->io_wait = NULL; @@ -1601,7 +1606,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) diff --git a/kernel/futex.c b/kernel/futex.c index 45490bec5831..5c3f45d07c53 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; static struct vfsmount *futex_mnt; /* + * Take mm->mmap_sem, when futex is shared + */ +static inline void futex_lock_mm(struct rw_semaphore *fshared) +{ + if (fshared) + down_read(fshared); +} + +/* + * Release mm->mmap_sem, when the futex is shared + */ +static inline void futex_unlock_mm(struct rw_semaphore *fshared) +{ + if (fshared) + up_read(fshared); +} + +/* * We hash on the keys returned from get_futex_key (see below). */ static struct futex_hash_bucket *hash_futex(union futex_key *key) @@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key *key) } EXPORT_SYMBOL_GPL(drop_futex_key_refs); -static inline int get_futex_value_locked(u32 *dest, u32 __user *from) +static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 curval; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + + return curval; +} + +static int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; @@ -620,9 +649,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | new_owner->pid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; @@ -659,9 +686,7 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. 
We are the owner: */ - pagefault_disable(); - oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); - pagefault_enable(); + oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); if (oldval == -EFAULT) return oldval; @@ -700,8 +725,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, union futex_key key; int ret; - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -725,8 +749,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -746,8 +769,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, op_ret, attempt = 0; retryfull: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) @@ -793,7 +815,7 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + fshared, attempt); if (ret) goto out; goto retry; @@ -803,8 +825,7 @@ retry: * If we would have faulted, release mmap_sem, * fault it in and start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(dummy, uaddr2); if (ret) @@ -841,8 +862,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); + return ret; } @@ -861,8 +882,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, drop_count = 0; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) @@ -890,8 +910,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(curval, uaddr1); @@ -944,8 +963,7 @@ out_unlock: drop_futex_key_refs(&key1); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -1113,10 +1131,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, while (!ret) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; @@ -1134,6 +1149,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, #define ARG3_SHARED 1 static long futex_wait_restart(struct restart_block *restart); + static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, u32 val, ktime_t *abs_time) { @@ -1148,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1186,8 +1201,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); @@ -1206,8 +1220,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. 
*/ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); /* * There might have been scheduling since the queue_me(), as we @@ -1285,8 +1298,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -1333,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1353,9 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ newval = current->pid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1398,9 +1407,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, lock_taken = 1; } - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1428,8 +1435,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * exit to complete. */ queue_unlock(&q, hb); - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1465,8 +1471,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); WARN_ON(!q.pi_state); /* @@ -1480,8 +1485,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = ret ? 0 : -EWOULDBLOCK; } - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1518,8 +1522,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret != -EINTR ? ret : -ERESTARTNOINTR; @@ -1527,8 +1530,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; uaddr_faulted: @@ -1550,8 +1552,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto retry_unlocked; } - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1585,8 +1586,7 @@ retry: /* * First take all the futex related locks: */ - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1601,11 +1601,9 @@ retry_unlocked: * again. 
If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) { - pagefault_disable(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - pagefault_enable(); - } + if (!(uval & FUTEX_OWNER_DIED)) + uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); + if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1647,8 +1645,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; @@ -1671,8 +1668,7 @@ pi_faulted: goto retry_unlocked; } - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1729,8 +1725,8 @@ static int futex_fd(u32 __user *uaddr, int signal) if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); + "will be removed from the kernel in June 2007\n", + current->comm); } ret = -EINVAL; @@ -1908,10 +1904,8 @@ retry: * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi) { - if (uval & FUTEX_WAITERS) + if (!pi && (uval & FUTEX_WAITERS)) futex_wake(uaddr, &curr->mm->mmap_sem, 1); - } } return 0; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 23c03f43e196..72d034258ba1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1406,7 +1406,7 @@ static void migrate_hrtimers(int cpu) static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - long cpu = (long)hcpu; + unsigned int cpu = (long)hcpu; switch (action) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bd9e272d55e9..32b161972fad 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { if (unlikely(action_ret != IRQ_HANDLED)) { - desc->irqs_unhandled++; + /* + * If we are seeing only the odd spurious IRQ caused by + * bus asynchronicity then don't eventually trigger an error, + * otherwise the couter becomes a doomsday timer for otherwise + * working systems + */ + if (jiffies - desc->last_unhandled > HZ/10) + desc->irqs_unhandled = 1; + else + desc->irqs_unhandled++; + desc->last_unhandled = jiffies; if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fed54418626c..474219a41929 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos) /* Lookup the address for this symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name) { - char namebuf[KSYM_NAME_LEN+1]; + char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; @@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr, { const char *msym; - namebuf[KSYM_NAME_LEN] = 0; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; if (is_ksym_addr(addr)) { @@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ msym = module_address_lookup(addr, symbolsize, offset, modname); if (msym) - return strncpy(namebuf, msym, KSYM_NAME_LEN); + return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); return NULL; } @@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr, int lookup_symbol_name(unsigned long addr, char *symname) { symname[0] = '\0'; - symname[KSYM_NAME_LEN] = '\0'; + symname[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; @@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) { name[0] = '\0'; - name[KSYM_NAME_LEN] = '\0'; + name[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; @@ -312,18 +312,17 @@ int sprint_symbol(char *buffer, unsigned long address) char *modname; const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN+1]; + char namebuf[KSYM_NAME_LEN]; name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); if (!name) return sprintf(buffer, "0x%lx", address); - else { - if (modname) - return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, + + if (modname) + return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, size, modname); - else - return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); - } + else + return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); } /* Look up a kernel symbol and print it to the kernel messages. 
*/ @@ -343,8 +342,8 @@ struct kallsym_iter unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols */ char type; - char name[KSYM_NAME_LEN+1]; - char module_name[MODULE_NAME_LEN + 1]; + char name[KSYM_NAME_LEN]; + char module_name[MODULE_NAME_LEN]; int exported; }; diff --git a/kernel/kfifo.c b/kernel/kfifo.c index cee419143fd4..bc41ad0f24f8 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/kfifo.h> +#include <linux/log2.h> /** * kfifo_init - allocates a new FIFO using a preallocated buffer @@ -41,7 +42,7 @@ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, struct kfifo *fifo; /* size must be a power of 2 */ - BUG_ON(size & (size - 1)); + BUG_ON(!is_power_of_2(size)); fifo = kmalloc(sizeof(struct kfifo), gfp_mask); if (!fifo) diff --git a/kernel/kthread.c b/kernel/kthread.c index bbd51b81a3e8..a404f7ee7395 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -215,7 +215,7 @@ int kthread_stop(struct task_struct *k) EXPORT_SYMBOL(kthread_stop); -static __init void kthreadd_setup(void) +static noinline __init_refok void kthreadd_setup(void) { struct task_struct *tsk = current; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1a5ff2211d88..edba2ffb43de 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -379,7 +379,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 static void print_lock_name(struct lock_class *class) { - char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; + char str[KSYM_NAME_LEN], c1, c2, c3, c4; const char *name; get_usage_chars(class, &c1, &c2, &c3, &c4); @@ -401,7 +401,7 @@ static void print_lock_name(struct lock_class *class) static void print_lockdep_cache(struct lockdep_map *lock) { const char *name; - char str[KSYM_NAME_LEN + 1]; + char str[KSYM_NAME_LEN]; name = lock->name; if (!name) diff --git a/kernel/module.c b/kernel/module.c index 015d60cfd90e..33c04ad51175 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -61,10 +61,8 @@ extern int module_sysfs_initialized; /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* Protects module list */ -static DEFINE_SPINLOCK(modlist_lock); - -/* List of modules, protected by module_mutex AND modlist_lock */ +/* List of modules, protected by module_mutex or preempt_disable + * (add/delete uses stop_machine). 
*/ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); @@ -760,14 +758,13 @@ static void print_unload_info(struct seq_file *m, struct module *mod) void __symbol_put(const char *symbol) { struct module *owner; - unsigned long flags; const unsigned long *crc; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); if (!__find_symbol(symbol, &owner, &crc, 1)) BUG(); module_put(owner); - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -1228,14 +1225,14 @@ static void free_module(struct module *mod) void *__symbol_get(const char *symbol) { struct module *owner; - unsigned long value, flags; + unsigned long value; const unsigned long *crc; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); value = __find_symbol(symbol, &owner, &crc, 1); if (value && !strong_try_module_get(owner)) value = 0; - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return (void *)value; } @@ -2136,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) sym = get_ksymbol(mod, addr, NULL, NULL); if (!sym) goto out; - strlcpy(symname, sym, KSYM_NAME_LEN + 1); + strlcpy(symname, sym, KSYM_NAME_LEN); mutex_unlock(&module_mutex); return 0; } @@ -2161,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, if (!sym) goto out; if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); + strlcpy(modname, mod->name, MODULE_NAME_LEN); if (name) - strlcpy(name, sym, KSYM_NAME_LEN + 1); + strlcpy(name, sym, KSYM_NAME_LEN); mutex_unlock(&module_mutex); return 0; } @@ -2184,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN + 1); - strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); + KSYM_NAME_LEN); + strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, mod); mutex_unlock(&module_mutex); return 0; @@ -2232,26 +2229,13 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Called by the /proc file system to return a list of modules. */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct list_head *i; - loff_t n = 0; - mutex_lock(&module_mutex); - list_for_each(i, &modules) { - if (n++ == *pos) - break; - } - if (i == &modules) - return NULL; - return i; + return seq_list_start(&modules, *pos); } static void *m_next(struct seq_file *m, void *p, loff_t *pos) { - struct list_head *i = p; - (*pos)++; - if (i->next == &modules) - return NULL; - return i->next; + return seq_list_next(p, &modules, pos); } static void m_stop(struct seq_file *m, void *p) @@ -2321,11 +2305,10 @@ const struct seq_operations modules_op = { /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { - unsigned long flags; const struct exception_table_entry *e = NULL; struct module *mod; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); list_for_each_entry(mod, &modules, list) { if (mod->num_exentries == 0) continue; @@ -2336,7 +2319,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) if (e) break; } - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); /* Now, if we found one, we are running inside it now, hence we cannot unload the module, hence no refcnt needed. 
*/ @@ -2348,25 +2331,24 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) */ int is_module_address(unsigned long addr) { - unsigned long flags; struct module *mod; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); list_for_each_entry(mod, &modules, list) { if (within(addr, mod->module_core, mod->core_size)) { - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return 1; } } - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return 0; } -/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ +/* Is this a valid kernel address? */ struct module *__module_text_address(unsigned long addr) { struct module *mod; @@ -2381,11 +2363,10 @@ struct module *__module_text_address(unsigned long addr) struct module *module_text_address(unsigned long addr) { struct module *mod; - unsigned long flags; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); mod = __module_text_address(addr); - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return mod; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 9e83b589f754..10f0bbba382b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -21,6 +21,8 @@ #include <linux/utsname.h> #include <linux/pid_namespace.h> +static struct kmem_cache *nsproxy_cachep; + struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); static inline void get_nsproxy(struct nsproxy *ns) @@ -43,9 +45,11 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) { struct nsproxy *ns; - ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); - if (ns) + ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); atomic_set(&ns->count, 1); + } return ns; } @@ -54,33 +58,51 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. 
*/ -static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, - struct fs_struct *new_fs) +static struct nsproxy *create_new_namespaces(unsigned long flags, + struct task_struct *tsk, struct fs_struct *new_fs) { struct nsproxy *new_nsp; + int err; new_nsp = clone_nsproxy(tsk->nsproxy); if (!new_nsp) return ERR_PTR(-ENOMEM); new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); - if (IS_ERR(new_nsp->mnt_ns)) + if (IS_ERR(new_nsp->mnt_ns)) { + err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; + } new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); - if (IS_ERR(new_nsp->uts_ns)) + if (IS_ERR(new_nsp->uts_ns)) { + err = PTR_ERR(new_nsp->uts_ns); goto out_uts; + } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); - if (IS_ERR(new_nsp->ipc_ns)) + if (IS_ERR(new_nsp->ipc_ns)) { + err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; + } new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); - if (IS_ERR(new_nsp->pid_ns)) + if (IS_ERR(new_nsp->pid_ns)) { + err = PTR_ERR(new_nsp->pid_ns); goto out_pid; + } + + new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); + if (IS_ERR(new_nsp->user_ns)) { + err = PTR_ERR(new_nsp->user_ns); + goto out_user; + } return new_nsp; +out_user: + if (new_nsp->pid_ns) + put_pid_ns(new_nsp->pid_ns); out_pid: if (new_nsp->ipc_ns) put_ipc_ns(new_nsp->ipc_ns); @@ -91,15 +113,15 @@ out_uts: if (new_nsp->mnt_ns) put_mnt_ns(new_nsp->mnt_ns); out_ns: - kfree(new_nsp); - return ERR_PTR(-ENOMEM); + kmem_cache_free(nsproxy_cachep, new_nsp); + return ERR_PTR(err); } /* * called from clone. This now handles copy for nsproxy and all * namespaces therein. */ -int copy_namespaces(int flags, struct task_struct *tsk) +int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct nsproxy *new_ns; @@ -110,7 +132,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -140,7 +162,9 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns) put_pid_ns(ns->pid_ns); - kfree(ns); + if (ns->user_ns) + put_user_ns(ns->user_ns); + kmem_cache_free(nsproxy_cachep, ns); } /* @@ -152,19 +176,10 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, { int err = 0; - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWUSER))) return 0; -#ifndef CONFIG_IPC_NS - if (unshare_flags & CLONE_NEWIPC) - return -EINVAL; -#endif - -#ifndef CONFIG_UTS_NS - if (unshare_flags & CLONE_NEWUTS) - return -EINVAL; -#endif - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -174,3 +189,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, err = PTR_ERR(*new_nsp); return err; } + +static int __init nsproxy_cache_init(void) +{ + nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), + 0, SLAB_PANIC, NULL, NULL); + return 0; +} + +module_init(nsproxy_cache_init); diff --git a/kernel/panic.c b/kernel/panic.c index 623d1828259a..f64f4c1ac11f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -159,14 +159,15 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 
'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' '); + tainted & TAINT_USER ? 'U' : ' ', + tainted & TAINT_DIE ? 'D' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); diff --git a/kernel/pid.c b/kernel/pid.c index eb66bd2953ab..c6e3f9ffff87 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); -struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) { BUG_ON(!old_ns); get_pid_ns(old_ns); diff --git a/kernel/printk.c b/kernel/printk.c index 0bbdeac2810c..051d27e36a6c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -449,13 +449,16 @@ static int printk_time = 1; #else static int printk_time = 0; #endif -module_param(printk_time, int, S_IRUGO | S_IWUSR); +module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { if (*str) return 0; printk_time = 1; + printk(KERN_NOTICE "The 'time' option is deprecated and " + "is scheduled for removal in early 2008\n"); + printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n"); return 1; } @@ -483,6 +486,9 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. + * Be aware of the fact that if oops_in_progress is not set, we might try to + * wake klogd up which could deadlock on runqueue lock if printk() is called + * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output @@ -654,7 +660,7 @@ static void call_console_drivers(unsigned long start, unsigned long end) */ static int __init console_setup(char *str) { - char name[sizeof(console_cmdline[0].name)]; + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ char *s, *options; int idx; @@ -662,27 +668,27 @@ static int __init console_setup(char *str) * Decode str into name, index, options. */ if (str[0] >= '0' && str[0] <= '9') { - strcpy(name, "ttyS"); - strncpy(name + 4, str, sizeof(name) - 5); + strcpy(buf, "ttyS"); + strncpy(buf + 4, str, sizeof(buf) - 5); } else { - strncpy(name, str, sizeof(name) - 1); + strncpy(buf, str, sizeof(buf) - 1); } - name[sizeof(name) - 1] = 0; + buf[sizeof(buf) - 1] = 0; if ((options = strchr(str, ',')) != NULL) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) - strcpy(name, "ttyS0"); + strcpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) - strcpy(name, "ttyS1"); + strcpy(buf, "ttyS1"); #endif - for (s = name; *s; s++) + for (s = buf; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; - add_preferred_console(name, idx, options); + add_preferred_console(buf, idx, options); return 1; } __setup("console=", console_setup); @@ -709,7 +715,7 @@ int __init add_preferred_console(char *name, int idx, char *options) * See if this tty is not yet registered, and * if we have a slot free. 
*/ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) if (strcmp(console_cmdline[i].name, name) == 0 && console_cmdline[i].index == idx) { selected_console = i; @@ -726,6 +732,25 @@ int __init add_preferred_console(char *name, int idx, char *options) return 0; } +int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) +{ + struct console_cmdline *c; + int i; + + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + c = &console_cmdline[i]; + memcpy(c->name, name_new, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx_new; + return i; + } + /* not found */ + return -1; +} + #ifndef CONFIG_DISABLE_CONSOLE_SUSPEND /** * suspend_console - suspend the console subsystem @@ -942,6 +967,9 @@ void register_console(struct console *console) if (preferred_console < 0 || bootconsole || !console_drivers) preferred_console = selected_console; + if (console->early_setup) + console->early_setup(); + /* * See if we want to use this console driver. If we * didn't select a console we take the first one @@ -985,12 +1013,15 @@ void register_console(struct console *console) if (!(console->flags & CON_ENABLED)) return; - if (bootconsole) { + if (bootconsole && (console->flags & CON_CONSDEV)) { printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", bootconsole->name, bootconsole->index, console->name, console->index); unregister_console(bootconsole); console->flags &= ~CON_PRINTBUFFER; + } else { + printk(KERN_INFO "console [%s%d] enabled\n", + console->name, console->index); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ad7949a589dd..4a1745f1dadf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct *task) int ptrace_attach(struct task_struct *task) { int retval; + unsigned long flags; audit_ptrace(task); @@ -181,9 +182,7 @@ repeat: * cpu's that may have task_lock). */ task_lock(task); - local_irq_disable(); - if (!write_trylock(&tasklist_lock)) { - local_irq_enable(); + if (!write_trylock_irqsave(&tasklist_lock, flags)) { task_unlock(task); do { cpu_relax(); @@ -211,7 +210,7 @@ repeat: force_sig_specific(SIGSTOP, task); bad: - write_unlock_irq(&tasklist_lock); + write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); out: return retval; @@ -491,3 +490,22 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) return ret; } #endif /* __ARCH_SYS_PTRACE */ + +int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) +{ + unsigned long tmp; + int copied; + + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + if (copied != sizeof(tmp)) + return -EIO; + return put_user(tmp, (unsigned long __user *)data); +} + +int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +{ + int copied; + + copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + return (copied == sizeof(data)) ? 
0 : -EIO; +} diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 55ba82a85a66..ddff33247785 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -40,6 +40,7 @@ #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> +#include <linux/freezer.h> #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> @@ -518,7 +519,6 @@ rcu_torture_writer(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; do { schedule_timeout_uninterruptible(1); @@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -589,7 +588,6 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; do { idx = cur_ops->readlock(); diff --git a/kernel/relay.c b/kernel/relay.c index 3b299fb3855c..a615a8f513fc 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1061,7 +1061,7 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -/** +/* * subbuf_splice_actor - splice up to one subbuf's worth of data */ static int subbuf_splice_actor(struct file *in, @@ -1074,7 +1074,9 @@ static int subbuf_splice_actor(struct file *in, unsigned int pidx, poff, total_len, subbuf_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; - size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size; + uint64_t pos = (uint64_t) *ppos; + uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; + size_t read_start = (size_t) do_div(pos, alloc_size); size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index da8d6bf46457..5aedbee014df 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -29,12 +29,6 @@ #include "rtmutex_common.h" -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - # define TRACE_WARN_ON(x) WARN_ON(x) # define TRACE_BUG_ON(x) BUG_ON(x) diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 015fc633c96c..e3055ba69159 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -260,6 +260,7 @@ static int test_func(void *data) int ret; current->flags |= PF_MUTEX_TESTER; + set_freezable(); allow_signal(SIGHUP); for(;;) { diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 17d28ce20300..8cd9bd2cdb34 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -17,12 +17,6 @@ #include "rtmutex_common.h" -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - /* * lock->owner state tracking: * diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856e791e..2d3b83593ca3 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -103,7 +103,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) { - return (struct task_struct *) + return (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } @@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct 
*proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + #endif diff --git a/kernel/sched.c b/kernel/sched.c index 9fbced64bfee..cb31fb4a1379 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -736,7 +736,9 @@ static void update_curr_load(struct rq *rq, u64 now) * * The "10% effect" is relative and cumulative: from _any_ nice level, * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) */ static const int prio_to_weight[40] = { /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, @@ -746,15 +748,22 @@ static const int prio_to_weight[40] = { /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, }; +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ static const u32 prio_to_wmult[40] = { - 48356, 60446, 75558, 94446, 118058, 147573, - 184467, 230589, 288233, 360285, 450347, - 562979, 703746, 879575, 1099582, 1374389, - 717986, 2147483, 2684354, 3355443, 4194304, - 244160, 6557201, 8196502, 10250518, 12782640, - 16025997, 19976592, 24970740, 31350126, 39045157, - 49367440, 61356675, 76695844, 95443717, 119304647, - 148102320, 186737708, 238609294, 286331153, +/* -20 */ 48356, 60446, 75558, 94446, 118058, +/* -15 */ 147573, 184467, 230589, 288233, 360285, +/* -10 */ 450347, 562979, 703746, 879575, 1099582, +/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, +/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, +/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, +/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, +/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; static inline void @@ -4647,14 +4656,14 @@ static void show_task(struct task_struct *p) state = p->state ? __ffs(p->state) + 1 : 0; printk("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if (BITS_PER_LONG == 32) +#if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(" running "); + printk(" running "); else - printk(" %08lX ", thread_saved_pc(p)); + printk(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(" running task "); + printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif @@ -4666,11 +4675,7 @@ static void show_task(struct task_struct *p) free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk("%5lu %5d %6d", free, p->pid, p->parent->pid); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); + printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -4680,14 +4685,12 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if (BITS_PER_LONG == 32) - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); #else - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -4778,7 +4781,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 10000000; + const unsigned long gran_limit = 100000000; sysctl_sched_granularity *= factor; if (sysctl_sched_granularity > gran_limit) @@ -4909,8 +4912,6 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - try_to_freeze(); - spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { @@ -5144,7 +5145,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1baf87cceb7c..29f2c21e7da2 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -171,7 +171,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c3391b6020e8..ad64fcb731f2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -10,6 +10,7 @@ #include <linux/sched.h> /* #define SECCOMP_DEBUG 1 */ +#define NR_SECCOMP_MODES 1 /* * Secure computing mode 1 allows only read/write/exit/sigreturn. 
@@ -54,3 +55,31 @@ void __secure_computing(int this_syscall) #endif do_exit(SIGKILL); } + +long prctl_get_seccomp(void) +{ + return current->seccomp.mode; +} + +long prctl_set_seccomp(unsigned long seccomp_mode) +{ + long ret; + + /* can set it only once to be even more secure */ + ret = -EPERM; + if (unlikely(current->seccomp.mode)) + goto out; + + ret = -EINVAL; + if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { + current->seccomp.mode = seccomp_mode; + set_thread_flag(TIF_SECCOMP); +#ifdef TIF_NOTSC + disable_TSC(); +#endif + ret = 0; + } + + out: + return ret; +} diff --git a/kernel/signal.c b/kernel/signal.c index f9405609774e..39d122753bac 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -718,6 +718,37 @@ out_set: #define LEGACY_QUEUE(sigptr, sig) \ (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) +int print_fatal_signals; + +static void print_fatal_signal(struct pt_regs *regs, int signr) +{ + printk("%s/%d: potentially unexpected fatal signal %d.\n", + current->comm, current->pid, signr); + +#ifdef __i386__ + printk("code at %08lx: ", regs->eip); + { + int i; + for (i = 0; i < 16; i++) { + unsigned char insn; + + __get_user(insn, (unsigned char *)(regs->eip + i)); + printk("%02x ", insn); + } + } +#endif + printk("\n"); + show_regs(regs); +} + +static int __init setup_print_fatal_signals(char *str) +{ + get_option (&str, &print_fatal_signals); + + return 1; +} + +__setup("print-fatal-signals=", setup_print_fatal_signals); static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) @@ -1855,6 +1886,8 @@ relock: * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; + if ((signr != SIGKILL) && print_fatal_signals) + print_fatal_signal(regs, signr); if (sig_kernel_coredump(signr)) { /* * If it was able to dump core, this kills all diff --git a/kernel/softirq.c b/kernel/softirq.c index 73217a9e2875..0f546ddea43d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -14,6 +14,7 @@ #include <linux/notifier.h> #include <linux/percpu.h> #include <linux/cpu.h> +#include <linux/freezer.h> #include <linux/kthread.h> #include <linux/rcupdate.h> #include <linux/smp.h> @@ -488,8 +489,6 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { - current->flags |= PF_NOFREEZE; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -614,12 +613,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DEAD_FROZEN: { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; + sched_setscheduler(p, SCHED_FIFO, &param); kthread_stop(p); takeover_tasklets(hotcpu); break; + } #endif /* CONFIG_HOTPLUG_CPU */ } return NOTIFY_OK; } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0131e296ffb4..708d4882c0c3 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/init.h> #include <linux/delay.h> +#include <linux/freezer.h> #include <linux/kthread.h> #include <linux/notifier.h> #include <linux/module.h> @@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu) struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; sched_setscheduler(current, SCHED_FIFO, &param); - current->flags |= PF_NOFREEZE; /* initialize timestamp */ touch_softlockup_watchdog(); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fcee2a8e6da3..319821ef78af 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -93,10 +93,6 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { int i, ret = 0; - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler(current, SCHED_FIFO, &param); atomic_set(&stopmachine_thread_ack, 0); stopmachine_num_threads = 0; @@ -189,6 +185,10 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + /* One high-prio thread per cpu. We'll do this one. */ + sched_setscheduler(p, SCHED_FIFO, &param); kthread_bind(p, cpu); wake_up_process(p); wait_for_completion(&smdata.done); diff --git a/kernel/sys.c b/kernel/sys.c index 872271ccc384..4d141ae3e802 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -31,10 +31,12 @@ #include <linux/cn_proc.h> #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> +#include <linux/seccomp.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/kprobes.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -1078,13 +1080,13 @@ static int set_user(uid_t new_ruid, int dumpclear) { struct user_struct *new_user; - new_user = alloc_uid(new_ruid); + new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); if (!new_user) return -EAGAIN; if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != &root_user) { + new_user != current->nsproxy->user_ns->root_user) { free_uid(new_user); return -EAGAIN; } @@ -2241,6 +2243,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = SET_ENDIAN(current, arg2); break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2); + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7e11e2c98bf9..b0ec498a18d9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -14,6 +14,7 @@ asmlinkage long sys_ni_syscall(void) cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); +cond_syscall(sys32_quotactl); cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); cond_syscall(sys_swapon); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d93e13d93f24..7063ebc6db05 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -29,6 +29,7 @@ #include <linux/utsname.h> #include <linux/capability.h> #include <linux/smp_lock.h> +#include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kobject.h> @@ -49,9 +50,6 @@ #include <asm/uaccess.h> #include <asm/processor.h> -extern int proc_nr_files(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); - #ifdef CONFIG_X86 #include <asm/nmi.h> #include <asm/stacktrace.h> @@ -61,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, /* External variables not in a header file.
*/ extern int C_A_D; +extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; @@ -202,7 +201,10 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, - +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; @@ -340,6 +342,14 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef __sparc__ { .ctl_name = KERN_SPARC_REBOOT, @@ -814,6 +824,14 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hugepages_treat_as_movable", + .data = &hugepages_treat_as_movable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &hugetlb_treat_movable_handler, + }, #endif { .ctl_name = VM_LOWMEM_RESERVE_RATIO, @@ -958,6 +976,17 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#ifdef CONFIG_NUMA + { + .ctl_name = CTL_UNNUMBERED, + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = &numa_zonelist_order_handler, + .strategy = &sysctl_string, + }, +#endif #endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) @@ -972,6 +1001,10 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; @@ -1112,6 +1145,10 @@ static ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 906cae771585..059431ed67db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; + stats->nvcsw = tsk->nvcsw; + stats->nivcsw = tsk->nivcsw; bacct_add_tsk(stats, tsk); /* fill in extended acct fields */ @@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, */ delayacct_add_tsk(stats, tsk); + stats->nvcsw += tsk->nvcsw; + stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); unlock_task_sighand(first, &flags);
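The fill_pid()/fill_tgid() changes above copy the task_struct nvcsw/nivcsw counters into the taskstats reply. The same per-task counters already back getrusage(), so here is a rough userspace illustration of what is being counted (this is not the taskstats netlink interface itself):

#include <stdio.h>
#include <time.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage ru;
	struct timespec ts = { 0, 1000000 };	/* 1 ms */
	int i;

	/* Sleeping blocks, so every iteration schedules out voluntarily. */
	for (i = 0; i < 100; i++)
		nanosleep(&ts, NULL);

	getrusage(RUSAGE_SELF, &ru);
	printf("voluntary=%ld involuntary=%ld\n",
	       ru.ru_nvcsw, ru.ru_nivcsw);
	return 0;
}

diff --git a/kernel/time.c b/kernel/time.c index f04791f69408..ffe19149d770 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -57,14 +57,17 @@ EXPORT_SYMBOL(sys_tz); */ asmlinkage long sys_time(time_t __user * tloc) { - time_t i; - struct timeval tv; + /* + * We read xtime.tv_sec atomically - it's updated + * atomically by update_wall_time(), so no need to + * even read-lock the xtime seqlock: + */ + time_t i = xtime.tv_sec; - do_gettimeofday(&tv); - i = tv.tv_sec; + smp_rmb(); /* sys_time() results are coherent */ if (tloc) { - if (put_user(i,tloc)) + if (put_user(i, tloc)) i = -EFAULT; } return i; } @@ -373,12 +376,25 @@ void do_gettimeofday (struct timeval *tv) tv->tv_sec = sec; tv->tv_usec = usec; -} + /* + * Make sure xtime.tv_sec [returned by sys_time()] always + * follows the gettimeofday() result precisely.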
This + * condition is extremely unlikely, it can hit at most + * once per second: + */ + if (unlikely(xtime.tv_sec != tv->tv_sec)) { + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + update_wall_time(); + write_sequnlock_irqrestore(&xtime_lock, flags); + } +} EXPORT_SYMBOL(do_gettimeofday); +#else /* CONFIG_TIME_INTERPOLATION */ -#else #ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval @@ -394,7 +410,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif -#endif +#endif /* CONFIG_TIME_INTERPOLATION */ /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 76212b2a99de..2ad1c37b8dfe 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -205,47 +205,6 @@ void clockevents_exchange_device(struct clock_event_device *old, } /** - * clockevents_request_device - */ -struct clock_event_device *clockevents_request_device(unsigned int features, - cpumask_t cpumask) -{ - struct clock_event_device *cur, *dev = NULL; - struct list_head *tmp; - - spin_lock(&clockevents_lock); - - list_for_each(tmp, &clockevent_devices) { - cur = list_entry(tmp, struct clock_event_device, list); - - if ((cur->features & features) == features && - cpus_equal(cpumask, cur->cpumask)) { - if (!dev || dev->rating < cur->rating) - dev = cur; - } - } - - clockevents_exchange_device(NULL, dev); - - spin_unlock(&clockevents_lock); - - return dev; -} - -/** - * clockevents_release_device - */ -void clockevents_release_device(struct clock_event_device *dev) -{ - spin_lock(&clockevents_lock); - - clockevents_exchange_device(dev, NULL); - clockevents_notify_released(); - - spin_unlock(&clockevents_lock); -} - -/** * clockevents_notify - notification about relevant events */ void clockevents_notify(unsigned long reason, void *arg) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cf53bb5814cb..438c6b723ee2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -13,7 +13,7 @@ #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/hrtimer.h> - +#include <linux/capability.h> #include <asm/div64.h> #include <asm/timex.h> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d1042f82a68..728cedfd3cbd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock); * at zero at system boot time, so wall_to_monotonic will be negative, * however, we will ALWAYS keep the tv_nsec part positive so we can use * the usual normalization. + * + * wall_to_monotonic is moved after resume from suspend for the monotonic + * time not to jump. We need to add total_sleep_time to wall_to_monotonic + * to get the real boot based time offset. + * + * - wall_to_monotonic is no longer the boot time, getboottime must be + * used instead. 
*/ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static unsigned long total_sleep_time; /* seconds */ EXPORT_SYMBOL(xtime); @@ -251,6 +259,7 @@ void __init timekeeping_init(void) xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + total_sleep_time = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev) xtime.tv_sec += sleep_length; wall_to_monotonic.tv_sec -= sleep_length; + total_sleep_time += sleep_length; } /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); @@ -476,3 +486,30 @@ void update_wall_time(void) change_clocksource(); update_vsyscall(&xtime, clock); } + +/** + * getboottime - Return the real time of system boot. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + * + * This is based on the wall_to_monotonic offset and the total suspend + * time. Calls to settimeofday will affect the value returned (which + * basically means that however wrong your real time clock is at boot time, + * you get the right time here). + */ +void getboottime(struct timespec *ts) +{ + set_normalized_timespec(ts, + - (wall_to_monotonic.tv_sec + total_sleep_time), + - wall_to_monotonic.tv_nsec); +} + +/** + * monotonic_to_bootbased - Convert the monotonic time to boot based. + * @ts: pointer to the timespec to be converted + */ +void monotonic_to_bootbased(struct timespec *ts) +{ + ts->tv_sec += total_sleep_time; +}
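The bookkeeping above keeps three clocks consistent: monotonic time excludes suspend, boot-based time includes it, and getboottime() recovers the wall-clock time of boot. A standalone sketch of the arithmetic with made-up numbers (whole seconds only; the real code normalizes nanoseconds through set_normalized_timespec()):

#include <stdio.h>

int main(void)
{
	/* Suppose the box booted at wall time 999900, ran 60 s awake,
	 * then spent 40 s suspended; wall time is now 1000000.
	 * timekeeping_resume() has done wall_to_monotonic -= 40 and
	 * total_sleep_time += 40. */
	long xtime_sec    = 1000000;
	long wall_to_mono = -999940;
	long total_sleep  = 40;

	long monotonic  = xtime_sec + wall_to_mono;	  /* 60: suspend excluded */
	long boot_based = monotonic + total_sleep;	  /* 100: real uptime */
	long boottime   = -(wall_to_mono + total_sleep);  /* 999900: wall time at boot */

	printf("mono=%ld boot-based=%ld boottime=%ld\n",
	       monotonic, boot_based, boottime);
	return 0;
}

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 8bbcfb77f7d2..e5edc3a22a08 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); static void print_name_offset(struct seq_file *m, void *sym) { - char symname[KSYM_NAME_LEN+1]; + char symname[KSYM_NAME_LEN]; if (lookup_symbol_name((unsigned long)sym, symname) < 0) SEQ_printf(m, "<%p>", sym); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 321693724ad7..8ed62fda16c6 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -68,6 +68,7 @@ struct entry { * Number of timeout events: */ unsigned long count; + unsigned int timer_flag; /* * We save the command-line string to preserve @@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) * incremented. Otherwise the timer is registered in a free slot.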
*/ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char * comm) + void *timerf, char *comm, + unsigned int timer_flag) { /* * It doesnt matter which lock we take: @@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.start_func = startf; input.expire_func = timerf; input.pid = pid; + input.timer_flag = timer_flag; spin_lock_irqsave(lock, flags); if (!active) @@ -266,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { - char symname[KSYM_NAME_LEN+1]; + char symname[KSYM_NAME_LEN]; if (lookup_symbol_name(addr, symname) < 0) seq_printf(m, "<%p>", (void *)addr); @@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v) period = ktime_to_timespec(time); ms = period.tv_nsec / 1000000; - seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_puts(m, "Timer Stats Version: v0.2\n"); seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); if (atomic_read(&overflow_count)) seq_printf(m, "Overflow: %d entries\n", @@ -303,8 +306,13 @@ for (i = 0; i < nr_entries; i++) { entry = entries + i; - seq_printf(m, "%4lu, %5d %-16s ", + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); + } else { + seq_printf(m, " %4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + } print_name_offset(m, (unsigned long)entry->start_func); seq_puts(m, " ("); diff --git a/kernel/timer.c b/kernel/timer.c index 1a69705c2fb9..b7792fb03387 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; } + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} #endif /** @@ -1114,6 +1128,7 @@ int do_sysinfo(struct sysinfo *info) getnstimeofday(&tp); tp.tv_sec += wall_to_monotonic.tv_sec; tp.tv_nsec += wall_to_monotonic.tv_nsec; + monotonic_to_bootbased(&tp); if (tp.tv_nsec - NSEC_PER_SEC >= 0) { tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; @@ -1206,7 +1221,8 @@ static int __devinit init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), GFP_KERNEL, + base = kmalloc_node(sizeof(*base), + GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); if (!base) return -ENOMEM; @@ -1217,7 +1233,6 @@ static int __devinit init_timers_cpu(int cpu) kfree(base); return -ENOMEM; } - memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { /*
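With monotonic_to_bootbased() applied in do_sysinfo() above, the uptime reported by sysinfo(2) now includes time spent suspended. That is visible from userspace with nothing more than:

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si) != 0) {
		perror("sysinfo");
		return 1;
	}
	/* With this patch, uptime counts suspended time too. */
	printf("uptime: %ld s\n", (long)si.uptime);
	return 0;
}

diff --git a/kernel/user.c b/kernel/user.c index 4869563080e9..98b82507797a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -14,20 +14,19 @@ #include <linux/bitops.h> #include <linux/key.h> #include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/user_namespace.h> /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ -#define UIDHASH_BITS (CONFIG_BASE_SMALL ?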
3 : 8) -#define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) +#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) static struct kmem_cache *uid_cachep; -static struct list_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is @@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; + struct user_namespace *ns = current->nsproxy->user_ns; spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(uid)); + ret = uid_hash_find(uid, uidhashentry(ns, uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } @@ -120,9 +120,9 @@ void free_uid(struct user_struct *up) } } -struct user_struct * alloc_uid(uid_t uid) +struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(uid); + struct list_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -211,11 +211,11 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(uidhash_table + n); + INIT_LIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(0)); + uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); spin_unlock_irq(&uidhash_lock); return 0;
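Note that the hash function itself is untouched by the namespace work; only the table it indexes moves into struct user_namespace. A standalone illustration of how __uidhashfn() buckets uids, assuming the default UIDHASH_BITS of 8 (CONFIG_BASE_SMALL unset):

#include <stdio.h>

#define UIDHASH_BITS	8
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)

int main(void)
{
	unsigned int uids[] = { 0, 255, 256, 1000, 65534 };
	int i;

	for (i = 0; i < 5; i++)
		/* e.g. uid 1000 -> ((1000 >> 8) + 1000) & 255 = 235 */
		printf("uid %5u -> bucket %3u\n",
		       uids[i], __uidhashfn(uids[i]));
	return 0;
}

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c new file mode 100644 index 000000000000..d055d987850c --- /dev/null +++ b/kernel/user_namespace.c @@ -0,0 +1,87 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/nsproxy.h> +#include <linux/user_namespace.h> + +struct user_namespace init_user_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .root_user = &root_user, +}; + +EXPORT_SYMBOL_GPL(init_user_ns); + +#ifdef CONFIG_USER_NS + +/* + * Clone a new ns copying an original user ns, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) +{ + struct user_namespace *ns; + struct user_struct *new_user; + int n; + + ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); + if (!ns) + return ERR_PTR(-ENOMEM); + + kref_init(&ns->kref); + + for (n = 0; n < UIDHASH_SZ; ++n) + INIT_LIST_HEAD(ns->uidhash_table + n); + + /* Insert new root user.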
*/ + ns->root_user = alloc_uid(ns, 0); + if (!ns->root_user) { + kfree(ns); + return ERR_PTR(-ENOMEM); + } + + /* Reset current->user with a new one */ + new_user = alloc_uid(ns, current->uid); + if (!new_user) { + free_uid(ns->root_user); + kfree(ns); + return ERR_PTR(-ENOMEM); + } + + switch_uid(new_user); + return ns; +} + +struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) +{ + struct user_namespace *new_ns; + + BUG_ON(!old_ns); + get_user_ns(old_ns); + + if (!(flags & CLONE_NEWUSER)) + return old_ns; + + new_ns = clone_user_ns(old_ns); + + put_user_ns(old_ns); + return new_ns; +} + +void free_user_ns(struct kref *kref) +{ + struct user_namespace *ns; + + ns = container_of(kref, struct user_namespace, kref); + kfree(ns); +} + +#endif /* CONFIG_USER_NS */ diff --git a/kernel/utsname.c b/kernel/utsname.c index 160c8c5136bd..9d8180a0f0d8 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include <linux/uts.h> #include <linux/utsname.h> #include <linux/version.h> +#include <linux/err.h> /* * Clone a new ns copying an original utsname, setting refcount to 1 @@ -24,10 +25,11 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) struct uts_namespace *ns; ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); - if (ns) { - memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - kref_init(&ns->kref); - } + if (!ns) + return ERR_PTR(-ENOMEM); + + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); return ns; } @@ -37,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index f22b9dbd2a9c..c76c06466bfd 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -18,10 +18,7 @@ static void *get_uts(ctl_table *table, int write) { char *which = table->data; -#ifdef CONFIG_UTS_NS - struct uts_namespace *uts_ns = current->nsproxy->uts_ns; - which = (which - (char *)&init_uts_ns) + (char *)uts_ns; -#endif + if (!write) down_read(&uts_sem); else
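copy_user_ns() above mirrors the existing UTS namespace pattern, and copy_utsname() now reports allocation failure through ERR_PTR(). A userspace sketch of the per-namespace semantics, using the already wired-up CLONE_NEWUTS flag (needs CAP_SYS_ADMIN; a CLONE_NEWUSER variant would look the same once callers pass that flag):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <unistd.h>

static char stack[64 * 1024];

static int child(void *arg)
{
	struct utsname uts;

	/* Writes the copy made by clone_uts_ns(), not the parent's. */
	sethostname("sandbox", strlen("sandbox"));
	uname(&uts);
	printf("child sees:  %s\n", uts.nodename);
	return 0;
}

int main(void)
{
	struct utsname uts;
	pid_t pid = clone(child, stack + sizeof(stack),
			  CLONE_NEWUTS | SIGCHLD, NULL);

	if (pid == -1) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	uname(&uts);
	printf("parent sees: %s\n", uts.nodename);	/* unchanged */
	return 0;
}

diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bebf73be976..58e5c152a6bb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -282,8 +282,8 @@ static int worker_thread(void *__cwq) struct cpu_workqueue_struct *cwq = __cwq; DEFINE_WAIT(wait); - if (!cwq->wq->freezeable) - current->flags |= PF_NOFREEZE; + if (cwq->wq->freezeable) + set_freezable(); set_user_nice(current, -5); @@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) EXPORT_SYMBOL_GPL(flush_workqueue); /* - * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, + * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. */ static int try_to_grab_pending(struct work_struct *work) { struct cpu_workqueue_struct *cwq; - int ret = 0; + int ret = -1; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) - return 1; + return 0; /* * The queueing is in progress, or it is already queued.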
Try to @@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +static int __cancel_work_timer(struct work_struct *work, + struct timer_list* timer) +{ + int ret; + + do { + ret = (timer && likely(del_timer(timer))); + if (!ret) + ret = try_to_grab_pending(work); + wait_on_work(work); + } while (unlikely(ret < 0)); + + work_clear_pending(work); + return ret; +} + /** * cancel_work_sync - block until a work_struct's callback has terminated * @work: the work which is to be flushed * + * Returns true if @work was pending. + * * cancel_work_sync() will cancel the work if it is queued. If the work's * callback appears to be running, cancel_work_sync() will block until it * has completed. @@ -476,31 +494,26 @@ static void wait_on_work(struct work_struct *work) * The caller must ensure that workqueue_struct on which this work was last * queued can't be destroyed before this function returns. */ -void cancel_work_sync(struct work_struct *work) +int cancel_work_sync(struct work_struct *work) { - while (!try_to_grab_pending(work)) - cpu_relax(); - wait_on_work(work); - work_clear_pending(work); + return __cancel_work_timer(work, NULL); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** - * cancel_rearming_delayed_work - reliably kill off a delayed work. + * cancel_delayed_work_sync - reliably kill off a delayed work. * @dwork: the delayed work struct * + * Returns true if @dwork was pending. + * * It is possible to use this function if @dwork rearms itself via queue_work() * or queue_delayed_work(). See also the comment for cancel_work_sync(). */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) +int cancel_delayed_work_sync(struct delayed_work *dwork) { - while (!del_timer(&dwork->timer) && - !try_to_grab_pending(&dwork->work)) - cpu_relax(); - wait_on_work(&dwork->work); - work_clear_pending(&dwork->work); + return __cancel_work_timer(&dwork->work, &dwork->timer); } -EXPORT_SYMBOL(cancel_rearming_delayed_work); +EXPORT_SYMBOL(cancel_delayed_work_sync); static struct workqueue_struct *keventd_wq __read_mostly; @@ -739,18 +752,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) if (cwq->thread == NULL) return; + flush_cpu_workqueue(cwq); /* - * If the caller is CPU_DEAD the single flush_cpu_workqueue() - * is not enough, a concurrent flush_workqueue() can insert a - * barrier after us. + * If the caller is CPU_DEAD and cwq->worklist was not empty, + * a concurrent flush_workqueue() can insert a barrier after us. + * However, in that case run_workqueue() won't return and check + * kthread_should_stop() until it flushes all work_struct's. * When ->worklist becomes empty it is safe to exit because no * more work_structs can be queued on this cwq: flush_workqueue * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. */ - while (flush_cpu_workqueue(cwq)) - ; - kthread_stop(cwq->thread); cwq->thread = NULL; } |
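With try_to_grab_pending() now distinguishing "retry" (-1) from "pending bit owned" (0 or 1), both cancel helpers can report whether the work was actually pending. A sketch of a module using the renamed cancel_delayed_work_sync(); the demo_*/poll_* names are illustrative only:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
	/* ... periodic housekeeping ... */
	schedule_delayed_work(&poll_work, HZ);	/* re-arms itself */
}

static int __init demo_init(void)
{
	schedule_delayed_work(&poll_work, HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	/*
	 * Replaces cancel_rearming_delayed_work(): safe even though
	 * poll_fn() keeps re-queueing itself, and the return value
	 * now says whether a pending instance was cancelled (true)
	 * or a running one was waited for (false).
	 */
	if (cancel_delayed_work_sync(&poll_work))
		pr_debug("poll_work was still pending\n");
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");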