author     James Morris <james.l.morris@oracle.com>   2012-09-28 13:37:32 +1000
committer  James Morris <james.l.morris@oracle.com>   2012-09-28 13:37:32 +1000
commit     bf5308344527d015ac9a6d2bda4ad4d40fd7d943 (patch)
tree       566e61e2cfc648c374d15cfc8c661b73e1a471f8 /kernel/sched
parent     3585e96cd1049682b8a19a0b699422156e9d735b (diff)
parent     979570e02981d4a8fc20b3cc8fd651856c98ee9d (diff)
Merge tag 'v3.6-rc7' into next
Linux 3.6-rc7
Requested by David Howells so he can merge his key subsystem work into
my tree with requisite -linus changesets.
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c       | 108
-rw-r--r--   kernel/sched/fair.c       |  48
-rw-r--r--   kernel/sched/rt.c         |  14
-rw-r--r--   kernel/sched/sched.h      |   9
-rw-r--r--   kernel/sched/stop_task.c  |  22
5 files changed, 108 insertions, 93 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 82ad284f823b..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 # define nsecs_to_cputime(__nsecs)  nsecs_to_jiffies(__nsecs)
 #endif

+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+        u64 temp = (__force u64) rtime;
+
+        temp *= (__force u64) utime;
+
+        if (sizeof(cputime_t) == 4)
+                temp = div_u64(temp, (__force u32) total);
+        else
+                temp = div64_u64(temp, (__force u64) total);
+
+        return (__force cputime_t) temp;
+}
+
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
         cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
          */
         rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

-        if (total) {
-                u64 temp = (__force u64) rtime;
-
-                temp *= (__force u64) utime;
-                do_div(temp, (__force u32) total);
-                utime = (__force cputime_t) temp;
-        } else
+        if (total)
+                utime = scale_utime(utime, rtime, total);
+        else
                 utime = rtime;

         /*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
         total = cputime.utime + cputime.stime;
         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

-        if (total) {
-                u64 temp = (__force u64) rtime;
-
-                temp *= (__force u64) cputime.utime;
-                do_div(temp, (__force u32) total);
-                utime = (__force cputime_t) temp;
-        } else
+        if (total)
+                utime = scale_utime(cputime.utime, rtime, total);
+        else
                 utime = rtime;

         sig->prev_utime = max(sig->prev_utime, utime);
@@ -5298,27 +5304,17 @@ void idle_task_exit(void)
 }

 /*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
-        rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-        rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
  */
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
 {
-        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-        rq->calc_load_active = 0;
+        long delta = calc_load_fold_active(rq);
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks);
 }

 /*
@@ -5346,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu)
          */
         rq->stop = NULL;

-        /* Ensure any throttled groups are reachable by pick_next_task */
-        unthrottle_offline_cfs_rqs(rq);
-
         for ( ; ; ) {
                 /*
                  * There's this thread running, bail when that's the only
@@ -5612,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);

-                migrate_nr_uninterruptible(rq);
-                calc_global_load_remove(rq);
+                calc_load_migrate(rq);
                 break;
 #endif
         }
@@ -6022,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6040,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
         int id = cpu;

         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd) {
-                struct sched_domain *tmp = sd;
-                struct sched_group *sg, *prev;
-                bool right;
-
-                /*
-                 * Traverse to first CPU in group, and count hops
-                 * to cpu from there, switching direction on each
-                 * hop, never ever pointing the last CPU rightward.
-                 */
-                do {
-                        id = cpumask_first(sched_domain_span(tmp));
-                        prev = sg = tmp->groups;
-                        right = 1;
-
-                        while (cpumask_first(sched_group_cpus(sg)) != id)
-                                sg = sg->next;
-
-                        while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-                                prev = sg;
-                                sg = sg->next;
-                                right = !right;
-                        }
-
-                        /* A CPU went down, never point back to domain start. */
-                        if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-                                right = false;
-
-                        sg = right ? sg->next : prev;
-                        tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-                } while ((tmp = tmp->child));
-
+        if (sd)
                 id = cpumask_first(sched_domain_span(sd));
-        }

         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_id, cpu) = id;
@@ -7246,6 +7201,7 @@ int in_sched_functions(unsigned long addr)

 #ifdef CONFIG_CGROUP_SCHED
 struct task_group root_task_group;
+LIST_HEAD(task_groups);
 #endif

 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b3e70b..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         hrtimer_cancel(&cfs_b->slack_timer);
 }

-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
 {
         struct cfs_rq *cfs_rq;

@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
         return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

 #endif /* CONFIG_CFS_BANDWIDTH */

@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
         int cpu = smp_processor_id();
         int prev_cpu = task_cpu(p);
         struct sched_domain *sd;
+        struct sched_group *sg;
+        int i;

         /*
          * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
                 return prev_cpu;

         /*
-         * Otherwise, check assigned siblings to find an elegible idle cpu.
+         * Otherwise, iterate the domains and find an elegible idle cpu.
          */
         sd = rcu_dereference(per_cpu(sd_llc, target));
-
         for_each_lower_domain(sd) {
-                if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-                        continue;
-                if (idle_cpu(sd->idle_buddy))
-                        return sd->idle_buddy;
-        }
-
+                sg = sd->groups;
+                do {
+                        if (!cpumask_intersects(sched_group_cpus(sg),
+                                                tsk_cpus_allowed(p)))
+                                goto next;
+
+                        for_each_cpu(i, sched_group_cpus(sg)) {
+                                if (!idle_cpu(i))
+                                        goto next;
+                        }
+
+                        target = cpumask_first_and(sched_group_cpus(sg),
+                                        tsk_cpus_allowed(p));
+                        goto done;
+next:
+                        sg = sg->next;
+                } while (sg != sd->groups);
+        }
+done:
         return target;
 }

@@ -3387,6 +3401,14 @@ static int tg_load_down(struct task_group *tg, void *data)

 static void update_h_load(long cpu)
 {
+        struct rq *rq = cpu_rq(cpu);
+        unsigned long now = jiffies;
+
+        if (rq->h_load_throttle == now)
+                return;
+
+        rq->h_load_throttle = now;
+
         rcu_read_lock();
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
         rcu_read_unlock();
@@ -3650,7 +3672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
@@ -3797,7 +3818,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
@@ -4293,11 +4313,10 @@ redo:
         env.src_rq    = busiest;
         env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

+        update_h_load(env.src_cpu);
 more_balance:
         local_irq_save(flags);
         double_rq_lock(this_rq, busiest);
-        if (!env.loop)
-                update_h_load(env.src_cpu);

         /*
          * cur_ld_moved - load moved in current iteration
@@ -4949,6 +4968,9 @@ static void rq_online_fair(struct rq *rq)
 static void rq_offline_fair(struct rq *rq)
 {
         update_sysctl();
+
+        /* Ensure any throttled groups are reachable by pick_next_task */
+        unthrottle_offline_cfs_rqs(rq);
 }

 #endif /* CONFIG_SMP */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..e0b7ba9c040f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced:
                  * runtime - in which case borrowing doesn't make sense.
                  */
                 rt_rq->rt_runtime = RUNTIME_INF;
+                rt_rq->rt_throttled = 0;
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
                 raw_spin_unlock(&rt_b->rt_runtime_lock);
         }
@@ -788,6 +789,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
         const struct cpumask *span;

         span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+        /*
+         * FIXME: isolated CPUs should really leave the root task group,
+         * whether they are isolcpus or were isolated via cpusets, lest
+         * the timer run on a CPU which does not service all runqueues,
+         * potentially leaving other CPUs indefinitely throttled. If
+         * isolation is really required, the user will turn the throttle
+         * off to kill the perturbations it causes anyway. Meanwhile,
+         * this maintains functionality for boot and/or troubleshooting.
+         */
+        if (rt_b == &root_task_group.rt_bandwidth)
+                span = cpu_online_mask;
+#endif
         for_each_cpu(i, span) {
                 int enqueue = 0;
                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d6..0848fa36c383 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
 struct cfs_rq;
 struct rt_rq;

-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;

 struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
         /* list of leaf cfs_rq on this cpu: */
         struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+        unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_RT_GROUP_SCHED
         struct list_head leaf_rt_rq_list;
 #endif
@@ -1140,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);

 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);

 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
 {
         struct task_struct *stop = rq->stop;

-        if (stop && stop->on_rq)
+        if (stop && stop->on_rq) {
+                stop->se.exec_start = rq->clock_task;
                 return stop;
+        }

         return NULL;
 }
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)

 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 {
+        struct task_struct *curr = rq->curr;
+        u64 delta_exec;
+
+        delta_exec = rq->clock_task - curr->se.exec_start;
+        if (unlikely((s64)delta_exec < 0))
+                delta_exec = 0;
+
+        schedstat_set(curr->se.statistics.exec_max,
+                        max(curr->se.statistics.exec_max, delta_exec));
+
+        curr->se.sum_exec_runtime += delta_exec;
+        account_group_exec_runtime(curr, delta_exec);
+
+        curr->se.exec_start = rq->clock_task;
+        cpuacct_charge(curr, delta_exec);
 }

 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)

 static void set_curr_task_stop(struct rq *rq)
 {
+        struct task_struct *stop = rq->stop;
+
+        stop->se.exec_start = rq->clock_task;
 }

 static void switched_to_stop(struct rq *rq, struct task_struct *p)
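A note on the cputime hunk at the top of core.c: the new scale_utime() helper computes utime = rtime * utime / total and picks the divide primitive from sizeof(cputime_t), where the old inline code always went through do_div() with the total cast to u32. The snippet below is a minimal userspace sketch of that scaling, not the kernel code itself; uint64_t stands in for cputime_t, and the sample values are hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the scaled-utime calculation: distribute the
 * measured runtime (rtime) across utime/stime in their original ratio,
 * doing the multiply and divide in 64 bits so nothing is truncated.
 */
static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
        return total ? (rtime * utime) / total : rtime;
}

int main(void)
{
        uint64_t utime = 3000, stime = 1000;    /* hypothetical tick counts */
        uint64_t total = utime + stime;
        uint64_t rtime = 8000;                  /* runtime per the scheduler clock */

        /* 8000 * 3000 / 4000 = 6000: utime keeps its 3:1 share of rtime. */
        printf("scaled utime = %llu\n",
               (unsigned long long)scale_utime(utime, rtime, total));
        return 0;
}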