diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/acct.c | 4 | ||||
-rw-r--r-- | kernel/delayacct.c | 178 | ||||
-rw-r--r-- | kernel/exit.c | 10 | ||||
-rw-r--r-- | kernel/fork.c | 6 | ||||
-rw-r--r-- | kernel/kallsyms.c | 4 | ||||
-rw-r--r-- | kernel/kthread.c | 24 | ||||
-rw-r--r-- | kernel/module.c | 11 | ||||
-rw-r--r-- | kernel/power/pm.c | 37 | ||||
-rw-r--r-- | kernel/resource.c | 2 | ||||
-rw-r--r-- | kernel/rtmutex-tester.c | 1 | ||||
-rw-r--r-- | kernel/sched.c | 84 | ||||
-rw-r--r-- | kernel/softirq.c | 2 | ||||
-rw-r--r-- | kernel/sys.c | 2 | ||||
-rw-r--r-- | kernel/taskstats.c | 568 | ||||
-rw-r--r-- | kernel/timer.c | 20 |
16 files changed, 857 insertions, 98 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 47dbcd570cd8..d62ec66c1af2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index f18e0b8df3e1..2a7c933651c7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,7 +488,7 @@ static void do_acct_process(struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; @@ -496,7 +496,7 @@ static void do_acct_process(struct file *file) ac.ac_minflt = encode_comp_t(pacct->ac_minflt); ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock(¤t->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_swaps = encode_comp_t(0); diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..f05392d64267 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,178 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/sysctl.h> +#include <linux/delayacct.h> + +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; + return 1; +} +__setup("delayacct", delayacct_setup_enable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + spin_lock_init(&tsk->delays_lock); + /* No need to acquire tsk->delays_lock for allocation here unless + __delayacct_tsk_init called after tsk is attached to tasklist + */ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +void __delayacct_tsk_exit(struct task_struct *tsk) +{ + struct task_delay_info *delays = tsk->delays; + spin_lock(&tsk->delays_lock); + tsk->delays = NULL; + spin_unlock(&tsk->delays_lock); + kmem_cache_free(delayacct_cache, delays); +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(¤t->delays->lock); + *total += ns; + (*count)++; + spin_unlock(¤t->delays->lock); +} + +void __delayacct_blkio_start(void) +{ + delayacct_start(¤t->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->blkio_delay, + ¤t->delays->blkio_count); +} + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + struct timespec ts; + unsigned long t1,t2,t3; + + spin_lock(&tsk->delays_lock); + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + +done: + spin_unlock(&tsk->delays_lock); + return 0; +} + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + + spin_lock(&tsk->delays->lock); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock(&tsk->delays->lock); + return ret; +} + diff --git a/kernel/exit.c b/kernel/exit.c index 6664c084783d..dba194a8d416 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,8 @@ #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/mempolicy.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> #include <linux/cpuset.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -843,7 +845,9 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; + struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -881,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); + taskstats_exit_alloc(&tidstats, &mycpu); + acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -900,6 +906,10 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); + taskstats_exit_free(tidstats); + delayacct_tsk_exit(tsk); + exit_mm(tsk); if (group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index 926e5a68ea9e..1b0f7b1e0881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,6 +43,8 @@ #include <linux/rmap.h> #include <linux/acct.h> #include <linux/cn_proc.h> +#include <linux/delayacct.h> +#include <linux/taskstats_kern.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -818,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); + taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -862,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -883,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } @@ -1000,6 +1005,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf90..ab16a5a4cfe9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, - &iter->type, iter->name); + &iter->value, &iter->type, + iter->name, sizeof(iter->name)); if (iter->owner == NULL) return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index 24be714b04c7..4f9c60ef95e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - return kthread_stop_sem(k, NULL); -} -EXPORT_SYMBOL(kthread_stop); - -/** - * kthread_stop_sem - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * @s: semaphore that @k waits on while idle. - * - * Does essentially the same thing as kthread_stop() above, but wakes - * @k by calling up(@s). - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. - */ -int kthread_stop_sem(struct task_struct *k, struct semaphore *s) -{ int ret; mutex_lock(&kthread_stop_lock); @@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - if (s) - up(s); - else - wake_up_process(k); + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) return ret; } -EXPORT_SYMBOL(kthread_stop_sem); +EXPORT_SYMBOL(kthread_stop); static __init int helper_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 35e1b1f859d7..2a19cd47c046 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, - unsigned long *value, - char *type, - char namebuf[128]) +struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, size_t namelen) { struct module *mod; @@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum, if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; - strncpy(namebuf, - mod->strtab + mod->symtab[symnum].st_name, - 127); + strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, + namelen); mutex_unlock(&module_mutex); return mod; } diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 84063ac8fcfc..c50d15266c10 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, return dev; } -static void __pm_unregister(struct pm_dev *dev) -{ - if (dev) { - list_del(&dev->entry); - kfree(dev); - } -} - -/** - * pm_unregister_all - unregister all devices with matching callback - * @callback: callback function pointer - * - * Unregister every device that would call the callback passed. This - * is primarily meant as a helper function for loadable modules. It - * enables a module to give up all its managed devices without keeping - * its own private list. - */ - -void pm_unregister_all(pm_callback callback) -{ - struct list_head *entry; - - if (!callback) - return; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - entry = entry->next; - if (dev->callback == callback) - __pm_unregister(dev); - } - mutex_unlock(&pm_devs_lock); -} - /** * pm_send - send request to a single device * @dev: device to send to @@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data) } EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_unregister_all); EXPORT_SYMBOL(pm_send_all); EXPORT_SYMBOL(pm_active); diff --git a/kernel/resource.c b/kernel/resource.c index 129cf046e561..0dd3a857579e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -404,8 +404,6 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -EXPORT_SYMBOL(insert_resource); - /* * Given an existing resource, change its start and size to match the * arguments. Returns -EBUSY if it can't fit. Existing children of diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 494dac872a13..948bd8f643e2 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -275,6 +275,7 @@ static int test_func(void *data) /* Wait for the next command to be executed */ schedule(); + try_to_freeze(); if (signal_pending(current)) flush_signals(current); diff --git a/kernel/sched.c b/kernel/sched.c index d714611f1691..b44b9a43b0fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -51,6 +51,7 @@ #include <linux/times.h> #include <linux/acct.h> #include <linux/kprobes.h> +#include <linux/delayacct.h> #include <asm/tlb.h> #include <asm/unistd.h> @@ -501,9 +502,36 @@ struct file_operations proc_schedstat_operations = { .release = single_release, }; +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif @@ -523,7 +551,7 @@ static inline struct rq *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -551,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long now = jiffies, diff = 0; - struct rq *rq = task_rq(t); + unsigned long now = jiffies, delta_jiffies = 0; if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; + delta_jiffies = now - t->sched_info.last_queued; sched_info_dequeued(t); - t->sched_info.run_delay += diff; + t->sched_info.run_delay += delta_jiffies; t->sched_info.last_arrival = now; t->sched_info.pcnt++; - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; + rq_sched_info_arrive(task_rq(t), delta_jiffies); } /* @@ -585,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t) */ static inline void sched_info_queued(struct task_struct *t) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } /* @@ -595,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - struct rq *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - if (rq) - rq->rq_sched_info.cpu_time += diff; + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } /* @@ -610,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { struct rq *rq = task_rq(prev); @@ -625,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) if (next != rq->idle) sched_info_arrive(next); } +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* * Adding/removing a task to/from a priority array: @@ -1530,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) INIT_LIST_HEAD(&p->run_list); p->array = NULL; -#ifdef CONFIG_SCHEDSTATS - memset(&p->sched_info, 0, sizeof(p->sched_info)); +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; @@ -1788,7 +1816,15 @@ context_switch(struct rq *rq, struct task_struct *prev, WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -4526,9 +4562,11 @@ void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); } EXPORT_SYMBOL(io_schedule); @@ -4537,9 +4575,11 @@ long __sched io_schedule_timeout(long timeout) struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); return ret; } diff --git a/kernel/softirq.c b/kernel/softirq.c index fd12f2556f0d..0f08a84ae307 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_UNUSED_SYMBOL(open_softirq); /* June 2006 */ - /* Tasklets */ struct tasklet_head { diff --git a/kernel/sys.c b/kernel/sys.c index dbb3b9c7ea64..e236f98f7ec5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = current->mm->dumpable; break; case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 2) { + if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..f45179ce028e --- /dev/null +++ b/kernel/taskstats.c @@ -0,0 +1,568 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <net/genetlink.h> +#include <asm/atomic.h> + +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered; +kmem_cache_t *taskstats_cache; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] +__read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; + char valid; +}; + +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply = genlmsg_data(genlhdr); + int rc; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + return genlmsg_unicast(skb, pid); +} + +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, ret, delcount = 0; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_read(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) { + nlmsg_free(skb_cur); + rc = -ENOMEM; + break; + } + } + ret = genlmsg_unicast(skb_cur, s->pid); + if (ret == -ECONNREFUSED) { + s->valid = 0; + delcount++; + rc = ret; + } + skb_cur = skb_next; + } + up_read(&listeners->sem); + + if (!delcount) + return rc; + + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); + return rc; +} + +static int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * goto err; + */ + + rc = delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ + put_task_struct(tsk); + return rc; + +} + +static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + struct task_struct *tsk, *first; + unsigned long flags; + + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ + first = tgidtsk; + if (!first) { + read_lock(&tasklist_lock); + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + + tsk = first; + read_lock(&tasklist_lock); + do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; + /* + * Accounting subsystem can call its functions here to + * fill in relevant parts of struct taskstsats as follows + * + * per-task-foo(stats, tsk); + */ + delayacct_add_tsk(stats, tsk); + + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + + /* + * Accounting subsytems can also add calls here to modify + * fields of taskstats. + */ + + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; +} + +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; + + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + s->valid = 1; + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; + int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid(pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid(tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else { + rc = -EINVAL; + goto err; + } + + nla_nest_end(rep_skb, na); + + return send_reply(rep_skb, info->snd_pid); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + +/* Send pid data out on exit */ +void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, + int group_dead, unsigned int mycpu) +{ + int rc; + struct sk_buff *rep_skb; + void *reply; + size_t size; + int is_thread_group; + struct nlattr *na; + unsigned long flags; + + if (!family_registered || !tidstats) + return; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + + rc = 0; + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; /* PID + STATS + TGID + STATS */ + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + goto ret; + + rc = fill_pid(tsk->pid, tsk, tidstats); + if (rc < 0) + goto err_skb; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tidstats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group) + goto send; + + /* + * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving + */ + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tsk->signal->stats); + nla_nest_end(rep_skb, na); + +send: + send_cpu_listeners(rep_skb, mycpu); + return; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + goto ret; +err_skb: + nlmsg_free(rep_skb); +ret: + return; +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_user_cmd, + .policy = taskstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + unsigned int i; + + taskstats_cache = kmem_cache_create("taskstats_cache", + sizeof(struct taskstats), + 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + family_registered = 1; + return 0; +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); diff --git a/kernel/timer.c b/kernel/timer.c index 2a87430a58d4..05809c2e2fd6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; + cpu_relax(); } } @@ -968,6 +969,7 @@ void __init timekeeping_init(void) } +static int timekeeping_suspended; /* * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -983,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -990,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { .resume = timekeeping_resume, + .suspend = timekeeping_suspend, set_kset_name("timekeeping"), }; @@ -1100,13 +1115,16 @@ static void update_wall_time(void) { cycle_t offset; - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; #ifdef CONFIG_GENERIC_TIME offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. |