Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile    |    5
-rw-r--r--  kernel/sched/boost.c     |  217
-rw-r--r--  kernel/sched/clock.c     |    2
-rw-r--r--  kernel/sched/core.c      |  863
-rw-r--r--  kernel/sched/core_ctl.c  | 1171
-rw-r--r--  kernel/sched/cpupri.c    |   48
-rw-r--r--  kernel/sched/cputime.c   |   17
-rw-r--r--  kernel/sched/deadline.c  |   50
-rw-r--r--  kernel/sched/debug.c     |   46
-rw-r--r--  kernel/sched/energy.c    |   10
-rw-r--r--  kernel/sched/fair.c      | 2024
-rw-r--r--  kernel/sched/features.h  |    2
-rw-r--r--  kernel/sched/hmp.c       | 4416
-rw-r--r--  kernel/sched/idle_task.c |   25
-rw-r--r--  kernel/sched/rt.c        |  322
-rw-r--r--  kernel/sched/sched.h     |  865
-rw-r--r--  kernel/sched/sched_avg.c |  199
-rw-r--r--  kernel/sched/stop_task.c |   45
-rw-r--r--  kernel/sched/tune.c      |  191
19 files changed, 10118 insertions(+), 400 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 99378130a42f..7dde1b9918e4 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,13 +17,14 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o -obj-$(CONFIG_SCHED_WALT) += walt.o +obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c new file mode 100644 index 000000000000..5bdd51b1e55e --- /dev/null +++ b/kernel/sched/boost.c @@ -0,0 +1,217 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "sched.h" +#include <linux/of.h> +#include <linux/sched/core_ctl.h> +#include <trace/events/sched.h> + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ + +unsigned int sysctl_sched_boost; +static enum sched_boost_policy boost_policy; +static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE; +static DEFINE_MUTEX(boost_mutex); +static unsigned int freq_aggr_threshold_backup; + +static inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +static void boost_kick_cpus(void) +{ + int i; + struct cpumask kick_mask; + + if (boost_policy != SCHED_BOOST_ON_BIG) + return; + + cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask); + + for_each_cpu(i, &kick_mask) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +/* + * Scheduler boost type and boost policy might at first seem unrelated, + * however, there exists a connection between them that will allow us + * to use them interchangeably during placement decisions. We'll explain + * the connection here in one possible way so that the implications are + * clear when looking at placement policies. + * + * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED + * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can + * neither be none nor RESTRAINED. 
+ */ +static void set_boost_policy(int type) +{ + if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) { + boost_policy = SCHED_BOOST_NONE; + return; + } + + if (boost_policy_dt) { + boost_policy = boost_policy_dt; + return; + } + + if (min_possible_efficiency != max_possible_efficiency) { + boost_policy = SCHED_BOOST_ON_BIG; + return; + } + + boost_policy = SCHED_BOOST_ON_ALL; +} + +enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +static bool verify_boost_params(int old_val, int new_val) +{ + /* + * Boost can only be turned on or off. There is no possiblity of + * switching from one boost type to another or to set the same + * kind of boost several times. + */ + return !(!!old_val == !!new_val); +} + +static void _sched_set_boost(int old_val, int type) +{ + switch (type) { + case NO_BOOST: + if (old_val == FULL_THROTTLE_BOOST) + core_ctl_set_boost(false); + else if (old_val == CONSERVATIVE_BOOST) + restore_cgroup_boost_settings(); + else + update_freq_aggregate_threshold( + freq_aggr_threshold_backup); + break; + + case FULL_THROTTLE_BOOST: + core_ctl_set_boost(true); + boost_kick_cpus(); + break; + + case CONSERVATIVE_BOOST: + update_cgroup_boost_settings(); + boost_kick_cpus(); + break; + + case RESTRAINED_BOOST: + freq_aggr_threshold_backup = + update_freq_aggregate_threshold(1); + break; + + default: + WARN_ON(1); + return; + } + + set_boost_policy(type); + sysctl_sched_boost = type; + trace_sched_set_boost(type); +} + +void sched_boost_parse_dt(void) +{ + struct device_node *sn; + const char *boost_policy; + + sn = of_find_node_by_path("/sched-hmp"); + if (!sn) + return; + + if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { + if (!strcmp(boost_policy, "boost-on-big")) + boost_policy_dt = SCHED_BOOST_ON_BIG; + else if (!strcmp(boost_policy, "boost-on-all")) + boost_policy_dt = SCHED_BOOST_ON_ALL; + } +} + +int sched_set_boost(int type) +{ + int ret = 0; + + mutex_lock(&boost_mutex); + + if (verify_boost_params(sysctl_sched_boost, type)) + _sched_set_boost(sysctl_sched_boost, type); + else + ret = -EINVAL; + + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + mutex_lock(&boost_mutex); + + old_val = *data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (verify_boost_params(old_val, *data)) { + _sched_set_boost(old_val, *data); + } else { + *data = old_val; + ret = -EINVAL; + } + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost(void) +{ + return sysctl_sched_boost; +} diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5b0a..bc54e84675da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e3ff9a6d11f8..473293dd40a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -74,6 +75,8 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/irq.h> 
+#include <linux/sched/core_ctl.h> #include <linux/cpufreq_times.h> #include <asm/switch_to.h> @@ -83,14 +86,19 @@ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif +#ifdef CONFIG_MSM_APP_SETTINGS +#include <asm/app_api.h> +#endif #include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" +#include "../time/tick-internal.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#include "walt.h" + +ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -856,6 +864,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); + trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -864,6 +873,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); + trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -879,6 +889,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) if (task_contributes_to_load(p)) rq->nr_uninterruptible++; + if (flags & DEQUEUE_SLEEP) + clear_ed_task(p, rq); + dequeue_task(rq, p, flags); } @@ -1094,8 +1107,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); double_unlock_balance(rq, cpu_rq(new_cpu)); @@ -1105,8 +1118,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1128,6 +1141,8 @@ struct migration_arg { */ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) { + int src_cpu; + if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -1135,6 +1150,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return rq; + src_cpu = cpu_of(rq); rq = move_queued_task(rq, p, dest_cpu); return rq; @@ -1150,6 +1166,8 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + int src_cpu = cpu_of(rq); + bool moved = false; /* * The original target cpu might have gone down and we might @@ -1170,12 +1188,18 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. 
*/ - if (task_rq(p) == rq && task_on_rq_queued(p)) + if (task_rq(p) == rq && task_on_rq_queued(p)) { rq = __migrate_task(rq, p, arg->dest_cpu); + moved = true; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); local_irq_enable(); + + if (moved) + notify_migration(src_cpu, arg->dest_cpu, false, p); + return 0; } @@ -1234,6 +1258,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; unsigned int dest_cpu; int ret = 0; + cpumask_t allowed_mask; rq = task_rq_lock(p, &flags); @@ -1249,18 +1274,25 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + cpumask_and(&allowed_mask, &allowed_mask, cpu_active_mask); + + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + cpumask_and(&allowed_mask, cpu_active_mask, new_mask); + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } } do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -1299,6 +1331,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1315,7 +1356,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif #endif - trace_sched_migrate_task(p, new_cpu); + trace_sched_migrate_task(p, new_cpu, pct_task_load(p)); if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) @@ -1323,7 +1364,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; perf_event_task_migrate(p); - walt_fixup_busy_time(p, new_cpu); + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1337,11 +1378,13 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, cpu); p->on_rq = TASK_ON_RQ_QUEUED; activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1527,7 +1570,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. 
*/ if (unlikely(queued)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + ktime_t to = ktime_set(0, NSEC_PER_MSEC); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&to, HRTIMER_MODE_REL); @@ -1573,12 +1616,13 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; + enum { cpuset, possible, fail, bug } state = cpuset; int dest_cpu; + int isolated_candidate = -1; /* * If the node that the cpu is on has been offlined, cpu_to_node() @@ -1594,6 +1638,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } @@ -1606,6 +1652,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; goto out; } @@ -1624,6 +1680,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) break; case fail: + allow_iso = true; + state = bug; + break; + + case bug: BUG(); break; } @@ -1652,6 +1713,8 @@ static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, int sibling_count_hint) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -1669,13 +1732,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + !cpu_online(cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } -static void update_avg(u64 *avg, u64 sample) +void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; *avg += diff >> 3; @@ -1748,6 +1812,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -1839,6 +1904,8 @@ void sched_ttwu_pending(void) void scheduler_ipi(void) { + int cpu = smp_processor_id(); + /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send @@ -1846,9 +1913,18 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() && + !got_boost_kick()) return; + if (got_boost_kick()) { + struct rq *rq = cpu_rq(cpu); + + if (rq->curr->sched_class == &fair_sched_class) + check_for_migration(rq, rq->curr); + clear_boost_kick(cpu); + } + /* * Not all reschedule IPI handlers call irq_enter/irq_exit, since * traditionally all their work was done from the interrupt return @@ -1868,7 +1944,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. 
*/ - if (unlikely(got_nohz_idle_kick())) { + if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } @@ -1958,11 +2034,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, int sibling_count_hint) { unsigned long flags; - int cpu, success = 0; + int cpu, src_cpu, success = 0; #ifdef CONFIG_SMP + unsigned int old_load; struct rq *rq; u64 wallclock; + struct related_thread_group *grp = NULL; #endif + bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER); + bool check_group = false; + + wake_flags &= ~WF_NO_NOTIFIER; /* * If we are going to wake up a thread waiting for CONDITION we @@ -1972,13 +2054,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, */ smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + src_cpu = cpu = task_cpu(p); + if (!(p->state & state)) goto out; trace_sched_waking(p); success = 1; /* we're going to change ->state */ - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2045,11 +2128,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, rq = cpu_rq(task_cpu(p)); raw_spin_lock(&rq->lock); - wallclock = walt_ktime_clock(); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + old_load = task_load(p); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); raw_spin_unlock(&rq->lock); + rcu_read_lock(); + grp = task_related_thread_group(p); + if (update_preferred_cluster(grp, p, old_load)) + set_preferred_cluster(grp); + rcu_read_unlock(); + check_group = grp != NULL; + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2058,19 +2150,33 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, sibling_count_hint); - if (task_cpu(p) != cpu) { + + /* Refresh src_cpu as it could have changed since we last read it */ + src_cpu = task_cpu(p); + if (src_cpu != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + note_task_waking(p, wallclock); #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); stat: ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); + if (freq_notif_allowed) { + if (!same_freq_domain(src_cpu, cpu)) { + check_for_freq_change(cpu_rq(cpu), + false, check_group); + check_for_freq_change(cpu_rq(src_cpu), + false, check_group); + } else if (success) { + check_for_freq_change(cpu_rq(cpu), true, false); + } + } + return success; } @@ -2086,9 +2192,13 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) + if (rq != this_rq() || p == current) { + printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p," + " this_rq = %p, p = %p, current = %p\n", + __func__, task_pid_nr(p), p->comm, rq, + this_rq(), p, current); return; + } lockdep_assert_held(&rq->lock); @@ -2112,17 +2222,20 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); if (!task_on_rq_queued(p)) { - u64 wallclock = walt_ktime_clock(); + u64 wallclock = sched_ktime_clock(); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + update_task_ravg(rq->curr, rq, 
TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + note_task_waking(p, wallclock); } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); + /* Todo : Send cpufreq notifier */ } /** @@ -2143,6 +2256,26 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_process_no_notif - Wake up a specific process without notifying + * governor + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process_no_notif(struct task_struct *p) +{ + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1); +} +EXPORT_SYMBOL(wake_up_process_no_notif); + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0, 1); @@ -2167,6 +2300,44 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; } +#ifdef CONFIG_SCHED_HMP +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 wallclock; + + sched_set_group_id(p, 0); + + rq = task_rq_lock(p, &flags); + + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + enqueue_task(rq, p, 0); + clear_ed_task(p, rq); + task_rq_unlock(rq, p, &flags); + free_task_load_ptrs(p); +} +#endif /* CONFIG_SCHED_HMP */ + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
@@ -2188,7 +2359,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif INIT_LIST_HEAD(&p->se.group_node); - walt_init_new_task_load(p); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2204,6 +2374,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_rt_schedtune_timer(&p->rt); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2273,7 +2447,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); + int cpu; + + init_new_task_load(p); + cpu = get_cpu(); __sched_fork(clone_flags, p); /* @@ -2465,11 +2642,10 @@ void wake_up_new_task(struct task_struct *p) unsigned long flags; struct rq *rq; + add_new_task_to_grp(p); raw_spin_lock_irqsave(&p->pi_lock, flags); p->state = TASK_RUNNING; - walt_init_new_task_load(p); - /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2484,10 +2660,9 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p); + mark_task_starting(p); update_rq_clock(rq); post_init_entity_util_avg(&p->se); - - walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2615,6 +2790,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + +#ifdef CONFIG_MSM_APP_SETTINGS + if (use_app_setting) + switch_app_setting_bit(prev, next); + + if (use_32bit_app_setting || use_32bit_app_setting_pro) + switch_32bit_app_setting_bit(prev, next); +#endif } /** @@ -2906,7 +3089,7 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) *load = rq->load.weight; } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) /* * sched_exec - execve() is a valuable balancing opportunity, because at @@ -2916,18 +3099,23 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - int dest_cpu; + int dest_cpu, curr_cpu; + +#ifdef CONFIG_SCHED_HMP + return; +#endif raw_spin_lock_irqsave(&p->pi_lock, flags); + curr_cpu = task_cpu(p); dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); return; } unlock: @@ -2994,19 +3182,31 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + u64 wallclock; + bool early_notif; + u32 old_load; + struct related_thread_group *grp; sched_clock_tick(); raw_spin_lock(&rq->lock); - walt_set_window_start(rq); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, - walt_ktime_clock(), 0); + old_load = task_load(curr); + set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, 
TASK_UPDATE, wallclock, 0); + + cpufreq_update_util(rq, 0); + early_notif = early_detection_notify(rq, wallclock); raw_spin_unlock(&rq->lock); + if (early_notif) + atomic_notifier_call_chain(&load_alert_notifier_head, + 0, (void *)(long)cpu); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -3015,8 +3215,17 @@ void scheduler_tick(void) #endif rq_last_tick_reset(rq); + rcu_read_lock(); + grp = task_related_thread_group(curr); + if (update_preferred_cluster(grp, curr, old_load)) + set_preferred_cluster(grp); + rcu_read_unlock(); + if (curr->sched_class == &fair_sched_class) check_for_migration(rq, curr); + + if (cpu == tick_do_timer_cpu) + core_ctl_check(wallclock); } #ifdef CONFIG_NO_HZ_FULL @@ -3059,9 +3268,24 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * preemptoff stack tracing threshold in ns. + * default: 1ms + */ +unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL; + +struct preempt_store { + u64 ts; + unsigned long caddr[4]; + bool irqs_disabled; +}; + +static DEFINE_PER_CPU(struct preempt_store, the_ps); void preempt_count_add(int val) { + struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id()); + #ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? @@ -3082,6 +3306,13 @@ void preempt_count_add(int val) #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif + ps->ts = sched_clock(); + ps->caddr[0] = CALLER_ADDR0; + ps->caddr[1] = CALLER_ADDR1; + ps->caddr[2] = CALLER_ADDR2; + ps->caddr[3] = CALLER_ADDR3; + ps->irqs_disabled = irqs_disabled(); + trace_preempt_off(CALLER_ADDR0, ip); } } @@ -3104,8 +3335,22 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) + if (preempt_count() == val) { + struct preempt_store *ps = &per_cpu(the_ps, + raw_smp_processor_id()); + u64 delta = sched_clock() - ps->ts; + + /* + * Trace preempt disable stack if preemption + * is disabled for more than the threshold. 
+ */ + if (delta > sysctl_preemptoff_tracing_threshold_ns) + trace_sched_preempt_disable(delta, ps->irqs_disabled, + ps->caddr[0], ps->caddr[1], + ps->caddr[2], ps->caddr[3]); + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -3118,6 +3363,9 @@ NOKPROBE_SYMBOL(preempt_count_sub); */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3128,12 +3376,14 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); #endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -3248,7 +3498,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3267,13 +3516,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3306,14 +3558,20 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); - wallclock = walt_ktime_clock(); - walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); - walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; + BUG_ON(task_cpu(next) != cpu_of(rq)); + + wallclock = sched_ktime_clock(); if (likely(prev != next)) { + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); + cpufreq_update_util(rq, 0); + if (!is_idle_task(prev) && !prev->on_rq) + update_avg_burst(prev); + #ifdef CONFIG_SCHED_WALT if (!prev->on_rq) prev->last_sleep_ts = wallclock; @@ -3322,10 +3580,14 @@ static void __sched notrace __schedule(bool preempt) rq->curr = next; ++*switch_count; + set_task_last_switch_out(prev, wallclock); + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { + update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); + cpufreq_update_util(rq, 0); lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); } @@ -3510,7 +3772,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3539,11 +3801,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + 
prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3562,7 +3828,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3570,7 +3836,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3585,7 +3851,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3944,6 +4210,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); @@ -4127,17 +4394,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4147,15 +4411,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -4233,7 +4496,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); +EXPORT_SYMBOL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -4553,6 +4816,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; + int dest_cpu; + cpumask_t allowed_mask; rcu_read_lock(); @@ -4614,20 +4879,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. 
Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); + if (dest_cpu < nr_cpu_ids) { + retval = __set_cpus_allowed_ptr(p, new_mask, true); + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } } + } else { + retval = -EINVAL; } + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -4691,6 +4962,15 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + + /* + * The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -5112,6 +5392,8 @@ void show_state_filter(unsigned long state_filter) sched_show_task(p); } + touch_all_softlockup_watchdogs(); + #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif @@ -5149,6 +5431,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, @@ -5371,18 +5655,54 @@ static struct task_struct fake_task = { }; /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Remove a task from the runqueue and pretend that it's migrating. This + * should prevent migrations for the detached task and disallow further + * changes to tsk_cpus_allowed. + */ +static void +detach_one_task(struct task_struct *p, struct rq *rq, struct list_head *tasks) +{ + lockdep_assert_held(&rq->lock); + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(rq, p, 0); + list_add(&p->se.group_node, tasks); +} + +static void attach_tasks(struct list_head *tasks, struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_held(&rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + BUG_ON(task_rq(p) != rq); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + } +} + +/* + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. 
*/ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + LIST_HEAD(tasks); + cpumask_t avail_cpus; + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); /* * Fudge the rq selection such that the below task selection loop @@ -5418,6 +5738,14 @@ static void migrate_tasks(struct rq *dead_rq) BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) { + detach_one_task(next, rq, &tasks); + num_pinned_kthreads += 1; + lockdep_unpin_lock(&rq->lock); + continue; + } + /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either @@ -5436,26 +5764,271 @@ static void migrate_tasks(struct rq *dead_rq) * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. + * However, during cpu isolation the load balancer might have + * interferred since we don't stop all CPUs. Ignore warning for + * this case. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + WARN_ON(migrate_pinned_tasks); raw_spin_unlock(&next->pi_lock); continue; } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { + raw_spin_unlock(&next->pi_lock); raw_spin_unlock(&rq->lock); + notify_migration(dead_rq->cpu, dest_cpu, true, next); rq = dead_rq; + raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop; + + if (num_pinned_kthreads > 1) + attach_tasks(&tasks, rq); +} + +static void set_rq_online(struct rq *rq); +static void set_rq_offline(struct rq *rq); + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + watchdog_disable(cpu); + + irq_migrate_all_off_this_cpu(); + + local_irq_disable(); + + sched_ttwu_pending(); + + raw_spin_lock(&rq->lock); + + /* + * Temporarily mark the rq as offline. This will allow us to + * move tasks off the CPU. + */ + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, false); + + if (rq->rd) + set_rq_online(rq); + raw_spin_unlock(&rq->lock); + + /* + * We might have been in tickless state. Clear NOHZ flags to avoid + * us being kicked for helping out with balancing + */ + nohz_balance_clear_nohz_mask(cpu); + + clear_hmp_request(cpu); + local_irq_enable(); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd); + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. 
+ */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); } + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + cpu_maps_update_begin(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + ret_code = -EINVAL; + goto out; + } + + if (!cpu_online(cpu)) { + ret_code = -EINVAL; + goto out; + } + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + /* + * There is a race between watchdog being enabled by hotplug and + * core isolation disabling the watchdog. When a CPU is hotplugged in + * and the hotplug lock has been released the watchdog thread might + * not have run yet to enable the watchdog. + * We have to wait for the watchdog to be enabled before proceeding. + */ + if (!watchdog_configured(cpu)) { + msleep(20); + if (!watchdog_configured(cpu)) { + --cpu_isolation_vote[cpu]; + ret_code = -EBUSY; + goto out; + } + } + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + cpu_maps_update_done(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). 
+ */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + struct rq *rq = cpu_rq(cpu); + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + if (cpu_online(cpu)) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + rq->age_stamp = sched_clock_cpu(cpu); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + cpu_maps_update_begin(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + cpu_maps_update_done(); + return ret_code; +} + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5743,7 +6316,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: raw_spin_lock_irqsave(&rq->lock, flags); - walt_set_window_start(rq); + set_window_start(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; break; @@ -5764,17 +6337,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); - walt_migrate_sync_cpu(cpu); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, true); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; case CPU_DEAD: + clear_hmp_request(cpu); calc_load_migrate(rq); break; #endif @@ -6319,6 +6893,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + unsigned long next_balance = rq->next_balance; /* Remove the sched domains which do not contribute to scheduling. 
*/ for (tmp = sd; tmp; ) { @@ -6350,6 +6925,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; ) { + unsigned long interval; + + interval = msecs_to_jiffies(tmp->balance_interval); + if (time_after(next_balance, tmp->last_balance + interval)) + next_balance = tmp->last_balance + interval; + + tmp = tmp->parent; + } + rq->next_balance = next_balance; + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -6599,11 +7185,14 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + cpumask_t avail_mask; WARN_ON(!sg); do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + cpumask_andnot(&avail_mask, sched_group_cpus(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); sg = sg->next; } while (sg != sd->groups); @@ -7336,6 +7925,9 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); #endif +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); +#endif /* Fixup, ensure @sd has at least @child cpus. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -7400,7 +7992,8 @@ static int build_sched_domains(const struct cpumask *cpu_map, continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { - init_sched_energy(i, sd, tl->energy); + if (energy_aware()) + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } @@ -7722,6 +8315,8 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + update_cluster_topology(); + init_hrtick(); /* Move init over to a non-isolated CPU */ @@ -7740,6 +8335,7 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -7763,6 +8359,15 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; +#ifdef CONFIG_SCHED_HMP + pr_info("HMP scheduling enabled.\n"); +#endif + + BUG_ON(num_possible_cpus() > BITS_PER_LONG); + + sched_boost_parse_dt(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7879,12 +8484,53 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->max_idle_balance_cost = sysctl_sched_migration_cost; -#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_SCHED_HMP + cpumask_set_cpu(i, &rq->freq_domain_cpumask); + rq->hmp_stats.cumulative_runnable_avg = 0; + rq->window_start = 0; + rq->hmp_stats.nr_big_tasks = 0; + rq->hmp_flags = 0; rq->cur_irqload = 0; rq->avg_irqload = 0; rq->irqload_ts = 0; + rq->static_cpu_pwr_cost = 0; + rq->cc.cycles = 1; + rq->cc.time = 1; + rq->cstate = 0; + rq->wakeup_latency = 0; + rq->wakeup_energy = 0; + + /* + * All cpus part of same cluster by default. 
This avoids the + * need to check for rq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + rq->cluster = &init_cluster; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); + rq->old_busy_time = 0; + rq->old_estimated_time = 0; + rq->old_busy_time_group = 0; + rq->hmp_stats.pred_demands_sum = 0; + rq->curr_table = 0; + rq->prev_top = 0; + rq->curr_top = 0; + + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { + memset(&rq->load_subs[j], 0, + sizeof(struct load_subtractions)); + + rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES, + sizeof(u8), GFP_NOWAIT); + + /* No other choice */ + BUG_ON(!rq->top_tasks[j]); + + clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]); + } #endif + rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7900,6 +8546,11 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } + i = alloc_related_thread_groups(); + BUG_ON(i); + + set_hmp_defaults(); + set_load_weight(&init_task); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -7924,6 +8575,7 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -7977,6 +8629,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7989,6 +8642,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -8003,12 +8659,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); #endif dump_stack(); } @@ -8220,7 +8878,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -8229,7 +8887,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -8611,7 +9269,7 @@ int sched_rr_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +inline struct task_group *css_tg(struct cgroup_subsys_state *css) { return css ? 
container_of(css, struct task_group, css) : NULL; } @@ -9011,6 +9669,13 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, #endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { +#ifdef CONFIG_SCHED_HMP + { + .name = "upmigrate_discourage", + .read_u64 = cpu_upmigrate_discourage_read_u64, + .write_u64 = cpu_upmigrate_discourage_write_u64, + }, +#endif #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..2f060a570061 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1171 @@ +/* Copyright (c) 2014-2017, 2020 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> + +#include <trace/events/sched.h> +#include "sched.h" + +#define MAX_CPUS_PER_CLUSTER 4 +#define MAX_CLUSTERS 2 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + bool enable; + int nrrun; + bool nrrun_changed; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; + unsigned int max_nr; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t 
count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 
1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != state->num_cpus) + return -EINVAL; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + } + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + int i; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } + 
spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(is_big_cluster); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &is_big_cluster.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +#define RQ_AVG_TOLERANCE 2 +#define RQ_AVG_DEFAULT_MS 20 +static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; + +static s64 rq_avg_timestamp_ms; + +static void update_running_avg(bool trigger_update) +{ + int avg, iowait_avg, big_avg, old_nrrun; + int old_max_nr, max_nr, big_max_nr; + s64 now; + unsigned long flags; + struct cluster_data *cluster; + unsigned int index = 0; + + spin_lock_irqsave(&state_lock, flags); + + now = ktime_to_ms(ktime_get()); + if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + rq_avg_timestamp_ms = now; + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg, + &max_nr, &big_max_nr); + + spin_unlock_irqrestore(&state_lock, flags); + + for_each_cluster(cluster, index) { + if (!cluster->inited) + continue; + + old_nrrun = cluster->nrrun; + old_max_nr = cluster->max_nr; + cluster->nrrun = cluster->is_big_cluster ? big_avg : avg; + cluster->max_nr = cluster->is_big_cluster ? 
big_max_nr : max_nr; + + if (cluster->nrrun != old_nrrun || + cluster->max_nr != old_max_nr) { + + if (trigger_update) + apply_need(cluster); + else + cluster->nrrun_changed = true; + } + } + return; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? 
cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + if (new_need == last_need) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data *c = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = c->cluster; + unsigned int old_is_busy = c->is_busy; + + if (!cluster || !cluster->inited) + return 0; + + update_running_avg(false); + if (c->busy == busy && !cluster->nrrun_changed) + return 0; + c->busy = busy; + cluster->nrrun_changed = false; + + apply_need(cluster); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process_no_notif(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; +static u64 core_ctl_check_interval; + +static bool do_check(u64 wallclock) +{ + bool do_check = false; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + if ((wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval) { + core_ctl_check_timestamp = wallclock; + do_check = true; + } + spin_unlock_irqrestore(&state_lock, flags); + return do_check; +} + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (cluster->is_big_cluster) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + pr_err("Error turning off boost. 
Boost already turned off\n"); + ret = -EINVAL; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + break; + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) + apply_need(cluster); + + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_check(u64 wallclock) +{ + if (unlikely(!initialized)) + return; + + if (do_check(wallclock)) { + unsigned int index = 0; + struct cluster_data *cluster; + + update_running_avg(true); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). 
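+	 * state_lock is dropped around the sched_unisolate_cpu() call
+	 * itself; the walk then continues from the prefetched next entry
+	 * once the lock is reacquired.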
+ */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return NOTIFY_DONE; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + break; + + case CPU_DEAD: + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. 
*/ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + break; + default: + return NOTIFY_DONE; + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return NOTIFY_OK; +} + +static struct notifier_block __refdata cpu_notifier = { + .notifier_call = cpu_callback, +}; + +/* ============================ init code ============================== */ + +static cpumask_var_t core_ctl_disable_cpumask; +static bool core_ctl_disable_cpumask_present; + +static int __init core_ctl_disable_setup(char *str) +{ + if (!*str) + return -EINVAL; + + alloc_bootmem_cpumask_var(&core_ctl_disable_cpumask); + + if (cpulist_parse(str, core_ctl_disable_cpumask) < 0) { + free_bootmem_cpumask_var(core_ctl_disable_cpumask); + return -EINVAL; + } + + core_ctl_disable_cpumask_present = true; + pr_info("disable_cpumask=%*pbl\n", + cpumask_pr_args(core_ctl_disable_cpumask)); + + return 0; +} +early_param("core_ctl_disable_cpumask", core_ctl_disable_setup); + +static bool should_skip(const struct cpumask *mask) +{ + if (!core_ctl_disable_cpumask_present) + return false; + + /* + * We operate on a cluster basis. Disable the core_ctl for + * a cluster, if all of it's cpus are specified in + * core_ctl_disable_cpumask + */ + return cpumask_subset(mask, core_ctl_disable_cpumask); +} + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (should_skip(mask)) + return 0; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. 
Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + int ret; + + switch (val) { + case CPUFREQ_CREATE_POLICY: + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_pol_nb = { + .notifier_call = cpufreq_policy_cb, +}; + +static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_govinfo *info = data; + + switch (val) { + case CPUFREQ_LOAD_CHANGE: + core_ctl_set_busy(info->cpu, info->load); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_gov_nb = { + .notifier_call = cpufreq_gov_cb, +}; + +static int __init core_ctl_init(void) +{ + unsigned int cpu; + + if (should_skip(cpu_possible_mask)) + return 0; + + core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE) + * NSEC_PER_MSEC; + + register_cpu_notifier(&cpu_notifier); + cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER); + cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER); + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + struct cpufreq_policy *policy; + int ret; + + policy = cpufreq_cpu_get(cpu); + if (policy) { + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n" + , ret); + cpufreq_cpu_put(policy); + } + } + cpu_maps_update_done(); + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..14225d5d8617 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -27,6 +27,8 @@ * of the License. 
*/ +#include "sched.h" + #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> @@ -51,6 +53,27 @@ static int convert_prio(int prio) } /** + * drop_nopreempt_cpus - remove a cpu from the mask if it is likely + * non-preemptible + * @lowest_mask: mask with selected CPUs (non-NULL) + */ +static void +drop_nopreempt_cpus(struct cpumask *lowest_mask) +{ + unsigned int cpu = cpumask_first(lowest_mask); + + while (cpu < nr_cpu_ids) { + /* unlocked access */ + struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr); + + if (task_may_not_preempt(task, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + + cpu = cpumask_next(cpu, lowest_mask); + } +} + +/** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task @@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, { int idx = 0; int task_pri = convert_prio(p->prio); + bool drop_nopreempts = task_pri <= MAX_RT_PRIO; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); +retry: for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; int skip = 0; @@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); - + if (drop_nopreempts) + drop_nopreempt_cpus(lowest_mask); /* * We have to ensure that we have at least one bit * still set in the array, since the map could have @@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, return 1; } - + /* + * If we can't find any non-preemptible cpu's, retry so we can + * find the lowest priority target and avoid priority inversion. + */ + if (drop_nopreempts) { + drop_nopreempts = false; + goto retry; + } return 0; } @@ -246,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } + +/* + * cpupri_check_rt - check if CPU has a RT task + * should be called from rcu-sched read section. 
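+ * Returns true if this CPU's cpupri entry is above CPUPRI_NORMAL,
+ * i.e. an RT task is currently active on it.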
+ */ +bool cpupri_check_rt(void) +{ + int cpu = raw_smp_processor_id(); + + return cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL; +} diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index fc2cfd6b2941..e6ec68c15aa3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,7 +6,6 @@ #include <linux/context_tracking.h> #include <linux/cpufreq_times.h> #include "sched.h" -#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -51,10 +50,8 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; -#ifdef CONFIG_SCHED_WALT u64 wallclock; bool account = true; -#endif if (!sched_clock_irqtime) return; @@ -62,10 +59,8 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); -#ifdef CONFIG_SCHED_WALT wallclock = sched_clock_cpu(cpu); -#endif - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + delta = wallclock - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); irq_time_write_begin(); @@ -79,16 +74,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); -#ifdef CONFIG_SCHED_WALT else account = false; -#endif irq_time_write_end(); -#ifdef CONFIG_SCHED_WALT + if (account) - walt_account_irqtime(cpu, curr, delta, wallclock); -#endif + sched_account_irqtime(cpu, curr, delta, wallclock); + else if (curr != this_cpu_ksoftirqd()) + sched_account_irqstart(cpu, curr, wallclock); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5c6ffddcafcd..188c8388a63f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -291,9 +291,11 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p /* * By now the task is replenished and enqueued; migrate it. 
*/ + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); activate_task(later_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; if (!fallback) resched_curr(later_rq); @@ -992,6 +994,41 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -1001,7 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); - walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); + inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1016,7 +1053,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); - walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); + dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -1712,6 +1749,7 @@ retry: goto retry; } + next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); clear_average_bw(&next_task->dl, &rq->dl); next_task->on_rq = TASK_ON_RQ_MIGRATING; @@ -1719,6 +1757,7 @@ retry: next_task->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_QUEUED; ret = 1; resched_curr(later_rq); @@ -1804,6 +1843,7 @@ static void pull_dl_task(struct rq *this_rq) resched = true; + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); clear_average_bw(&p->dl, &src_rq->dl); p->on_rq = TASK_ON_RQ_MIGRATING; @@ -1811,6 +1851,7 @@ static void pull_dl_task(struct rq *this_rq) p->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; dmin = p->dl.deadline; /* Is there any other task even earlier? 
*/ @@ -2012,6 +2053,11 @@ const struct sched_class dl_sched_class = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_dl, + .dec_hmp_sched_stats = dec_hmp_sched_stats_dl, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7f7116622631..ed8e6bb4531b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -227,6 +227,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", cfs_rq->throttle_count); + SEQ_printf(m, " .%-30s: %d\n", "runtime_enabled", + cfs_rq->runtime_enabled); +#ifdef CONFIG_SCHED_HMP + SEQ_printf(m, " .%-30s: %d\n", "nr_big_tasks", + cfs_rq->hmp_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "cumulative_runnable_avg", + cfs_rq->hmp_stats.cumulative_runnable_avg); +#endif #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -306,6 +314,23 @@ do { \ P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); +#ifdef CONFIG_SMP + P(cpu_capacity); +#endif +#ifdef CONFIG_SCHED_HMP + P(static_cpu_pwr_cost); + P(cluster->static_cluster_pwr_cost); + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + P(hmp_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg", + rq->hmp_stats.cumulative_runnable_avg); +#endif #undef P #undef PN @@ -386,6 +411,15 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_HMP + P(sched_upmigrate); + P(sched_downmigrate); + P(sched_init_task_load_windows); + P(min_capacity); + P(max_capacity); + P(sched_ravg_window); + P(sched_load_granule); +#endif #undef PN #undef P @@ -408,6 +442,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } +#ifdef CONFIG_SYSRQ_SCHED_DEBUG void sysrq_sched_debug_show(void) { int cpu; @@ -417,6 +452,7 @@ void sysrq_sched_debug_show(void) print_cpu(NULL, cpu); } +#endif /* * This itererator needs some explanation. 
@@ -547,6 +583,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; + unsigned int load_avg; + + load_avg = pct_task_load(p); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); @@ -624,6 +663,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_cas_attempts); P(se.statistics.nr_wakeups_cas_count); +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + __P(load_avg); +#ifdef CONFIG_SCHED_HMP + P(ravg.demand); +#endif +#endif + { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c index b0656b7a93e3..50d183b1e156 100644 --- a/kernel/sched/energy.c +++ b/kernel/sched/energy.c @@ -27,7 +27,10 @@ #include <linux/sched_energy.h> #include <linux/stddef.h> +#include "sched.h" + struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; +bool sched_energy_aware; static void free_resources(void) { @@ -56,6 +59,13 @@ void init_sched_energy_costs(void) int sd_level, i, nstates, cpu; const __be32 *val; + if (!energy_aware()) { + sched_energy_aware = false; + return; + } + + sched_energy_aware = true; + for_each_possible_cpu(cpu) { cn = of_get_cpu_node(cpu, NULL); if (!cn) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 125ff775fe05..78bd960c3527 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,9 +32,8 @@ #include <linux/task_work.h> #include <linux/module.h> -#include <trace/events/sched.h> - #include "sched.h" +#include <trace/events/sched.h> #include "tune.h" #include "walt.h" @@ -56,12 +55,6 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; -#ifdef CONFIG_SCHED_WALT -unsigned int sysctl_sched_use_walt_cpu_util = 1; -unsigned int sysctl_sched_use_walt_task_util = 1; -__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = - (10 * NSEC_PER_MSEC); -#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -254,6 +247,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#ifdef CONFIG_SMP +static int active_load_balance_cpu_stop(void *data); +#endif const struct sched_class fair_sched_class; @@ -891,12 +887,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
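+			 * update_stats_wait_start() on the destination CPU
+			 * subtracts this saved delta from its rq clock, so the
+			 * wait time keeps accumulating across the migration.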
+ */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -910,23 +950,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2633,7 +2656,27 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ +u32 sched_get_wake_up_idle(struct task_struct *p) +{ + u32 enabled = p->flags & PF_WAKE_UP_IDLE; + + return !!enabled; +} +EXPORT_SYMBOL(sched_get_wake_up_idle); + +int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle) +{ + int enable = !!wake_up_idle; + + if (enable) + p->flags |= PF_WAKE_UP_IDLE; + else + p->flags &= ~PF_WAKE_UP_IDLE; + + return 0; +} +EXPORT_SYMBOL(sched_set_wake_up_idle); + static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, @@ -2713,6 +2756,1068 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#ifdef CONFIG_SCHED_HMP + +/* CPU selection flag */ +#define SBC_FLAG_PREV_CPU 0x1 +#define SBC_FLAG_BEST_CAP_CPU 0x2 +#define SBC_FLAG_CPU_COST 0x4 +#define SBC_FLAG_MIN_COST 0x8 +#define SBC_FLAG_IDLE_LEAST_LOADED 0x10 +#define SBC_FLAG_IDLE_CSTATE 0x20 +#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40 +#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80 +#define SBC_FLAG_CSTATE_LOAD 0x100 +#define SBC_FLAG_BEST_SIBLING 0x200 +#define SBC_FLAG_WAKER_CPU 0x400 +#define SBC_FLAG_PACK_TASK 0x800 + +/* Cluster selection flag */ +#define SBC_FLAG_COLOC_CLUSTER 0x10000 +#define SBC_FLAG_WAKER_CLUSTER 0x20000 +#define SBC_FLAG_BACKUP_CLUSTER 0x40000 +#define SBC_FLAG_BOOST_CLUSTER 0x80000 + +struct cpu_select_env { + struct task_struct *p; + struct related_thread_group *rtg; + u8 reason; + u8 need_idle:1; + u8 need_waker_cluster:1; + u8 sync:1; + enum sched_boost_policy boost_policy; + u8 pack_task:1; + int prev_cpu; + DECLARE_BITMAP(candidate_list, NR_CPUS); + DECLARE_BITMAP(backup_list, NR_CPUS); + u64 task_load; + u64 cpu_load; + u32 sbc_best_flag; + u32 sbc_best_cluster_flag; + struct cpumask search_cpus; +}; + +struct cluster_cpu_stats { + int best_idle_cpu, 
least_loaded_cpu; + int best_capacity_cpu, best_cpu, best_sibling_cpu; + int min_cost, best_sibling_cpu_cost; + int best_cpu_wakeup_latency; + u64 min_load, best_load, best_sibling_cpu_load; + s64 highest_spare_capacity; +}; + +/* + * Should task be woken to any available idle cpu? + * + * Waking tasks to idle cpu has mixed implications on both performance and + * power. In many cases, scheduler can't estimate correctly impact of using idle + * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel + * module to pass a strong hint to scheduler that the task in question should be + * woken to idle cpu, generally to improve performance. + */ +static inline int wake_to_idle(struct task_struct *p) +{ + return (current->flags & PF_WAKE_UP_IDLE) || + (p->flags & PF_WAKE_UP_IDLE); +} + +static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +{ + u64 total_load; + + total_load = env->task_load + env->cpu_load; + + if (total_load > sched_spill_load || + (rq->nr_running + 1) > sysctl_sched_spill_nr_run) + return 1; + + return 0; +} + +static int skip_cpu(int cpu, struct cpu_select_env *env) +{ + int tcpu = task_cpu(env->p); + int skip = 0; + + if (!env->reason) + return 0; + + if (is_reserved(cpu)) + return 1; + + switch (env->reason) { + case UP_MIGRATION: + skip = !idle_cpu(cpu); + break; + case IRQLOAD_MIGRATION: + /* Purposely fall through */ + default: + skip = (cpu == tcpu); + break; + } + + return skip; +} + +static inline int +acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + int tcpu; + + if (!env->reason) + return 1; + + tcpu = task_cpu(env->p); + switch (env->reason) { + case UP_MIGRATION: + return cluster->capacity > cpu_capacity(tcpu); + + case DOWN_MIGRATION: + return cluster->capacity < cpu_capacity(tcpu); + + default: + break; + } + + return 1; +} + +static int +skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + if (!test_bit(cluster->id, env->candidate_list)) + return 1; + + if (!acceptable_capacity(cluster, env)) { + __clear_bit(cluster->id, env->candidate_list); + return 1; + } + + return 0; +} + +static struct sched_cluster * +select_least_power_cluster(struct cpu_select_env *env) +{ + struct sched_cluster *cluster; + + if (env->rtg) { + int cpu = cluster_first_cpu(env->rtg->preferred_cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), cpu); + + if (task_load_will_fit(env->p, env->task_load, + cpu, env->boost_policy)) { + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_policy == SCHED_BOOST_NONE) + return env->rtg->preferred_cluster; + + for_each_sched_cluster(cluster) { + if (cluster != env->rtg->preferred_cluster) { + __set_bit(cluster->id, + env->backup_list); + __clear_bit(cluster->id, + env->candidate_list); + } + } + + return env->rtg->preferred_cluster; + } + + /* + * Since the task load does not fit on the preferred + * cluster anymore, pretend that the task does not + * have any preferred cluster. This allows the waking + * task to get the appropriate CPU it needs as per the + * non co-location placement policy without having to + * wait until the preferred cluster is updated. 
+ */ + env->rtg = NULL; + } + + for_each_sched_cluster(cluster) { + if (!skip_cluster(cluster, env)) { + int cpu = cluster_first_cpu(cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cpu); + if (task_load_will_fit(env->p, env->task_load, cpu, + env->boost_policy)) + return cluster; + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + } + } + + return NULL; +} + +static struct sched_cluster * +next_candidate(const unsigned long *list, int start, int end) +{ + int cluster_id; + + cluster_id = find_next_bit(list, end, start - 1 + 1); + if (cluster_id >= end) + return NULL; + + return sched_cluster[cluster_id]; +} + +static void +update_spare_capacity(struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu, int capacity, + u64 cpu_load) +{ + s64 spare_capacity = sched_ravg_window - cpu_load; + + if (spare_capacity > 0 && + (spare_capacity > stats->highest_spare_capacity || + (spare_capacity == stats->highest_spare_capacity && + ((!env->need_waker_cluster && + capacity > cpu_capacity(stats->best_capacity_cpu)) || + (env->need_waker_cluster && + cpu_rq(cpu)->nr_running < + cpu_rq(stats->best_capacity_cpu)->nr_running))))) { + /* + * If sync waker is the only runnable of CPU, cr_avg of the + * CPU is 0 so we have high chance to place the wakee on the + * waker's CPU which likely causes preemtion of the waker. + * This can lead migration of preempted waker. Place the + * wakee on the real idle CPU when it's possible by checking + * nr_running to avoid such preemption. + */ + stats->highest_spare_capacity = spare_capacity; + stats->best_capacity_cpu = cpu; + } +} + +static inline void find_backup_cluster( +struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + int i; + struct cpumask search_cpus; + + extern int num_clusters; + + while (!bitmap_empty(env->backup_list, num_clusters)) { + next = next_candidate(env->backup_list, 0, num_clusters); + __clear_bit(next->id, env->backup_list); + + cpumask_and(&search_cpus, &env->search_cpus, &next->cpus); + for_each_cpu(i, &search_cpus) { + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + update_spare_capacity(stats, env, i, next->capacity, + cpu_load_sync(i, env->sync)); + } + env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER; + } +} + +struct sched_cluster * +next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env, + struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + + extern int num_clusters; + + __clear_bit(cluster->id, env->candidate_list); + + if (env->rtg && preferred_cluster(cluster, env->p)) + return NULL; + + do { + if (bitmap_empty(env->candidate_list, num_clusters)) + return NULL; + + next = next_candidate(env->candidate_list, 0, num_clusters); + if (next) { + if (next->min_power_cost > stats->min_cost) { + clear_bit(next->id, env->candidate_list); + next = NULL; + continue; + } + + if (skip_cluster(next, env)) + next = NULL; + } + } while (!next); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cluster_first_cpu(next)); + return next; +} + +#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int wakeup_latency; + int prev_cpu = env->prev_cpu; + + wakeup_latency = cpu_rq(cpu)->wakeup_latency; + + if (env->need_idle) { + stats->min_cost = cpu_cost; + if 
(idle_cpu(cpu)) { + if (wakeup_latency < stats->best_cpu_wakeup_latency || + (wakeup_latency == stats->best_cpu_wakeup_latency && + cpu == prev_cpu)) { + stats->best_idle_cpu = cpu; + stats->best_cpu_wakeup_latency = wakeup_latency; + } + } else { + if (env->cpu_load < stats->min_load || + (env->cpu_load == stats->min_load && + cpu == prev_cpu)) { + stats->least_loaded_cpu = cpu; + stats->min_load = env->cpu_load; + } + } + + return; + } + + if (cpu_cost < stats->min_cost) { + stats->min_cost = cpu_cost; + stats->best_cpu_wakeup_latency = wakeup_latency; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CPU_COST; + return; + } + + /* CPU cost is the same. Start breaking the tie by C-state */ + + if (wakeup_latency > stats->best_cpu_wakeup_latency) + return; + + if (wakeup_latency < stats->best_cpu_wakeup_latency) { + stats->best_cpu_wakeup_latency = wakeup_latency; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER; + return; + } + + /* C-state is the same. Use prev CPU to break the tie */ + if (cpu == prev_cpu) { + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER; + return; + } + + if (stats->best_cpu != prev_cpu && + ((wakeup_latency == 0 && env->cpu_load < stats->best_load) || + (wakeup_latency > 0 && env->cpu_load > stats->best_load))) { + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD; + } +} +#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */ +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int prev_cpu = env->prev_cpu; + + if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) { + if (stats->best_sibling_cpu_cost > cpu_cost || + (stats->best_sibling_cpu_cost == cpu_cost && + stats->best_sibling_cpu_load > env->cpu_load)) { + stats->best_sibling_cpu_cost = cpu_cost; + stats->best_sibling_cpu_load = env->cpu_load; + stats->best_sibling_cpu = cpu; + } + } + + if ((cpu_cost < stats->min_cost) || + ((stats->best_cpu != prev_cpu && + stats->min_load > env->cpu_load) || cpu == prev_cpu)) { + if (env->need_idle) { + if (idle_cpu(cpu)) { + stats->min_cost = cpu_cost; + stats->best_idle_cpu = cpu; + } + } else { + stats->min_cost = cpu_cost; + stats->min_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_MIN_COST; + } + } +} +#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */ + +static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env) +{ + int cpu_cost; + + /* + * We try to find the least loaded *busy* CPU irrespective + * of the power cost. 
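+	 * When pack_task is set the cost is cpu_min_power_cost() alone;
+	 * otherwise it is power_cost() of the task's load plus the CPU's
+	 * current (sync-adjusted) load.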
+ */ + if (env->pack_task) + cpu_cost = cpu_min_power_cost(cpu); + + else + cpu_cost = power_cost(cpu, task_load(env->p) + + cpu_cravg_sync(cpu, env->sync)); + + if (cpu_cost <= stats->min_cost) + __update_cluster_stats(cpu, stats, env, cpu_cost); +} + +static void find_best_cpu_in_cluster(struct sched_cluster *c, + struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int i; + struct cpumask search_cpus; + + cpumask_and(&search_cpus, &env->search_cpus, &c->cpus); + + env->need_idle = wake_to_idle(env->p) || c->wake_up_idle; + + for_each_cpu(i, &search_cpus) { + env->cpu_load = cpu_load_sync(i, env->sync); + + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + if (skip_cpu(i, env)) + continue; + + update_spare_capacity(stats, env, i, c->capacity, + env->cpu_load); + + /* + * need_idle takes precedence over sched boost but when both + * are set, idlest CPU with in all the clusters is selected + * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the + * big cluster is selected within boost_policy = BOOST_ON_BIG. + */ + if ((!env->need_idle && + env->boost_policy != SCHED_BOOST_NONE) || + env->need_waker_cluster || + sched_cpu_high_irqload(i) || + spill_threshold_crossed(env, cpu_rq(i))) + continue; + + update_cluster_stats(i, stats, env); + } +} + +static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats) +{ + stats->best_cpu = stats->best_idle_cpu = -1; + stats->best_capacity_cpu = stats->best_sibling_cpu = -1; + stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX; + stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX; + stats->highest_spare_capacity = 0; + stats->least_loaded_cpu = -1; + stats->best_cpu_wakeup_latency = INT_MAX; + /* No need to initialize stats->best_load */ +} + +static inline bool env_has_special_flags(struct cpu_select_env *env) +{ + if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE || + env->reason) + return true; + + return false; +} + +static inline bool +bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int prev_cpu; + struct task_struct *task = env->p; + struct sched_cluster *cluster; + + if (!task->ravg.mark_start || !sched_short_sleep_task_threshold) + return false; + + prev_cpu = env->prev_cpu; + if (!cpumask_test_cpu(prev_cpu, &env->search_cpus)) + return false; + + if (task->ravg.mark_start - task->last_cpu_selected_ts >= + sched_long_cpu_selection_threshold) + return false; + + /* + * This function should be used by task wake up path only as it's + * assuming p->last_switch_out_ts as last sleep time. + * p->last_switch_out_ts can denote last preemption time as well as + * last sleep time. 
+ */ + if (task->ravg.mark_start - task->last_switch_out_ts >= + sched_short_sleep_task_threshold) + return false; + + env->task_load = scale_load_to_cpu(task_load(task), prev_cpu); + cluster = cpu_rq(prev_cpu)->cluster; + + if (!task_load_will_fit(task, env->task_load, prev_cpu, + sched_boost_policy())) { + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + return false; + } + + env->cpu_load = cpu_load_sync(prev_cpu, env->sync); + if (sched_cpu_high_irqload(prev_cpu) || + spill_threshold_crossed(env, cpu_rq(prev_cpu))) { + update_spare_capacity(stats, env, prev_cpu, + cluster->capacity, env->cpu_load); + cpumask_clear_cpu(prev_cpu, &env->search_cpus); + return false; + } + + return true; +} + +static inline bool +wake_to_waker_cluster(struct cpu_select_env *env) +{ + return env->sync && + task_load(current) > sched_big_waker_task_load && + task_load(env->p) < sched_small_wakee_task_load; +} + +static inline bool +bias_to_waker_cpu(struct cpu_select_env *env, int cpu) +{ + return sysctl_sched_prefer_sync_wakee_to_waker && + cpu_rq(cpu)->nr_running == 1 && + cpumask_test_cpu(cpu, &env->search_cpus); +} + +static inline int +cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster) +{ + return cpumask_intersects(&env->search_cpus, &cluster->cpus); +} + +/* return cheapest cpu that can fit this task */ +static int select_best_cpu(struct task_struct *p, int target, int reason, + int sync) +{ + struct sched_cluster *cluster, *pref_cluster = NULL; + struct cluster_cpu_stats stats; + struct related_thread_group *grp; + unsigned int sbc_flag = 0; + int cpu = raw_smp_processor_id(); + bool special; + + struct cpu_select_env env = { + .p = p, + .reason = reason, + .need_idle = wake_to_idle(p), + .need_waker_cluster = 0, + .sync = sync, + .prev_cpu = target, + .rtg = NULL, + .sbc_best_flag = 0, + .sbc_best_cluster_flag = 0, + .pack_task = false, + }; + + env.boost_policy = task_sched_boost(p) ? + sched_boost_policy() : SCHED_BOOST_NONE; + + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); + bitmap_zero(env.backup_list, NR_CPUS); + + cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask); + cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask); + + init_cluster_cpu_stats(&stats); + special = env_has_special_flags(&env); + + rcu_read_lock(); + + grp = task_related_thread_group(p); + + if (grp && grp->preferred_cluster) { + pref_cluster = grp->preferred_cluster; + if (!cluster_allowed(&env, pref_cluster)) + clear_bit(pref_cluster->id, env.candidate_list); + else + env.rtg = grp; + } else if (!special) { + cluster = cpu_rq(cpu)->cluster; + if (wake_to_waker_cluster(&env)) { + if (bias_to_waker_cpu(&env, cpu)) { + target = cpu; + sbc_flag = SBC_FLAG_WAKER_CLUSTER | + SBC_FLAG_WAKER_CPU; + goto out; + } else if (cluster_allowed(&env, cluster)) { + env.need_waker_cluster = 1; + bitmap_zero(env.candidate_list, NR_CPUS); + __set_bit(cluster->id, env.candidate_list); + env.sbc_best_cluster_flag = + SBC_FLAG_WAKER_CLUSTER; + } + } else if (bias_to_prev_cpu(&env, &stats)) { + sbc_flag = SBC_FLAG_PREV_CPU; + goto out; + } + } + + if (!special && is_short_burst_task(p)) { + env.pack_task = true; + sbc_flag = SBC_FLAG_PACK_TASK; + } +retry: + cluster = select_least_power_cluster(&env); + + if (!cluster) + goto out; + + /* + * 'cluster' now points to the minimum power cluster which can satisfy + * task's perf goals. Walk down the cluster list starting with that + * cluster. 
For non-small tasks, skip clusters that don't have + * mostly_idle/idle cpus + */ + + do { + find_best_cpu_in_cluster(cluster, &env, &stats); + + } while ((cluster = next_best_cluster(cluster, &env, &stats))); + + if (env.need_idle) { + if (stats.best_idle_cpu >= 0) { + target = stats.best_idle_cpu; + sbc_flag |= SBC_FLAG_IDLE_CSTATE; + } else if (stats.least_loaded_cpu >= 0) { + target = stats.least_loaded_cpu; + sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED; + } + } else if (stats.best_cpu >= 0) { + if (stats.best_sibling_cpu >= 0 && + stats.best_cpu != task_cpu(p) && + stats.min_cost == stats.best_sibling_cpu_cost) { + stats.best_cpu = stats.best_sibling_cpu; + sbc_flag |= SBC_FLAG_BEST_SIBLING; + } + sbc_flag |= env.sbc_best_flag; + target = stats.best_cpu; + } else { + if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) { + env.rtg = NULL; + goto retry; + } + + /* + * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with + * backup_list = little cluster, candidate_list = none and + * stats->best_capacity_cpu points the best spare capacity + * CPU among the CPUs in the big cluster. + */ + if (env.boost_policy == SCHED_BOOST_ON_BIG && + stats.best_capacity_cpu >= 0) + sbc_flag |= SBC_FLAG_BOOST_CLUSTER; + else + find_backup_cluster(&env, &stats); + + if (stats.best_capacity_cpu >= 0) { + target = stats.best_capacity_cpu; + sbc_flag |= SBC_FLAG_BEST_CAP_CPU; + } + } + p->last_cpu_selected_ts = sched_ktime_clock(); +out: + sbc_flag |= env.sbc_best_cluster_flag; + rcu_read_unlock(); + trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p), + env.reason, env.sync, env.need_idle, sbc_flag, target); + return target; +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static inline struct task_group *next_task_group(struct task_group *tg) +{ + tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); + + return (&tg->list == &task_groups) ? 
NULL : tg; +} + +/* Iterate over all cfs_rq in a cpu */ +#define for_each_cfs_rq(cfs_rq, tg, cpu) \ + for (tg = container_of(&task_groups, struct task_group, list); \ + ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));) + +void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) +{ + struct task_group *tg; + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + + for_each_cfs_rq(cfs_rq, tg, cpu) + reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra); + + rcu_read_unlock(); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); + +/* Add task's contribution to a cpu' HMP statistics */ +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* + * Although below check is not strictly required (as + * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called + * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on + * efficiency by short-circuiting for_each_sched_entity() loop when + * sched_disable_window_stats + */ + if (sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + inc_rq_hmp_stats(rq, p, change_cra); +} + +/* Remove task's contribution from a cpu' HMP statistics */ +static void +_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* See comment on efficiency in _inc_hmp_sched_stats_fair */ + if (sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + dec_rq_hmp_stats(rq, p, change_cra); +} + +static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _inc_hmp_sched_stats_fair(rq, p, 1); +} + +static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _dec_hmp_sched_stats_fair(rq, p, 1); +} + +static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); + } +} + +static int task_will_be_throttled(struct task_struct *p); + +#else /* CONFIG_CFS_BANDWIDTH */ + +inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } + +static void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + inc_nr_big_task(&rq->hmp_stats, p); + inc_cumulative_runnable_avg(&rq->hmp_stats, p); 
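+ /*
+ * Without CFS_BANDWIDTH there is no per-cfs_rq throttling to stop at,
+ * so the task's contribution is charged straight to the root rq's
+ * hmp_stats.
+ */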
+} + +static void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + dec_nr_big_task(&rq->hmp_stats, p); + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} +static void +fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); +} + +static inline int task_will_be_throttled(struct task_struct *p) +{ + return 0; +} + +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); +} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/* + * Reset balance_interval at all sched_domain levels of given cpu, so that it + * honors kick. + */ +static inline void reset_balance_interval(int cpu) +{ + struct sched_domain *sd; + + if (cpu >= nr_cpu_ids) + return; + + rcu_read_lock(); + for_each_domain(cpu, sd) + sd->balance_interval = 0; + rcu_read_unlock(); +} + +/* + * Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal + * cpu as per its demand or priority) + * + * Returns reason why task needs to be migrated + */ +static inline int migration_needed(struct task_struct *p, int cpu) +{ + int nice; + struct related_thread_group *grp; + + if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1) + return 0; + + /* No need to migrate task that is about to be throttled */ + if (task_will_be_throttled(p)) + return 0; + + if (sched_boost_policy() == SCHED_BOOST_ON_BIG && + cpu_capacity(cpu) != max_capacity && task_sched_boost(p)) + return UP_MIGRATION; + + if (sched_cpu_high_irqload(cpu)) + return IRQLOAD_MIGRATION; + + nice = task_nice(p); + rcu_read_lock(); + grp = task_related_thread_group(p); + /* + * Don't assume higher capacity means higher power. If the task + * is running on the power efficient CPU, avoid migrating it + * to a lower capacity cluster. + */ + if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) && + cpu_capacity(cpu) > min_capacity && + cpu_max_power_cost(cpu) == max_power_cost) { + rcu_read_unlock(); + return DOWN_MIGRATION; + } + + if (!task_will_fit(p, cpu)) { + rcu_read_unlock(); + return UP_MIGRATION; + } + rcu_read_unlock(); + + return 0; +} + +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +static DEFINE_RAW_SPINLOCK(migration_lock); + +static bool do_migration(int reason, int new_cpu, int cpu) +{ + if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) + && same_cluster(new_cpu, cpu)) + return false; + + /* Inter cluster high irqload migrations are OK */ + return new_cpu != cpu; +} + +/* + * Check if currently running task should be migrated to a better cpu. + * + * Todo: Effect this via changes to nohz_balancer_kick() and load balance? 
+ */ +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq), new_cpu; + int active_balance = 0, reason; + + reason = migration_needed(p, cpu); + if (!reason) + return; + + raw_spin_lock(&migration_lock); + new_cpu = select_best_cpu(p, cpu, reason, 0); + + if (do_migration(reason, new_cpu, cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + mark_reserved(new_cpu); + } + + raw_spin_unlock(&migration_lock); + + if (active_balance) + stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, + &rq->active_balance_work); +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->hmp_stats.nr_big_tasks = 0; + cfs_rq->hmp_stats.cumulative_runnable_avg = 0; + cfs_rq->hmp_stats.pred_demands_sum = 0; +} + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg += + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum; +} + +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg -= + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum; + + BUG_ON(stats->nr_big_tasks < 0 || + (s64)stats->cumulative_runnable_avg < 0); + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +#else /* CONFIG_CFS_BANDWIDTH */ + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#endif /* CONFIG_CFS_BANDWIDTH */ + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#define dec_throttled_cfs_rq_hmp_stats(...) +#define inc_throttled_cfs_rq_hmp_stats(...) 
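Aside (editorial illustration, not part of the patch): migration_needed() above applies a fixed precedence when deciding whether the currently running task should be moved at tick time: up-migration for a boosted task stuck off the highest-capacity cluster, then an IRQ-load migration away from a CPU with high IRQ activity, then a down-migration for a niced or upmigrate-discouraged task occupying the most power-expensive CPU, and finally an up-migration once the task no longer fits its current CPU. The stand-alone sketch below restates only that ordering; the struct fields and names are placeholders, and details such as the related_thread_group and min-capacity checks are deliberately dropped.

#include <stdbool.h>

enum migration_reason {
	NO_MIGRATION,
	UP_MIGRATION,
	DOWN_MIGRATION,
	IRQLOAD_MIGRATION,
};

struct cpu_view {			/* placeholder snapshot of the task's current CPU */
	bool max_capacity;		/* CPU is in the highest-capacity cluster */
	bool high_irqload;		/* sched_cpu_high_irqload() equivalent */
	bool max_power_cost;		/* most power-expensive CPU in the system */
};

struct task_view {			/* placeholder snapshot of the running task */
	bool sched_boosted;		/* task participates in SCHED_BOOST_ON_BIG */
	bool fits_current_cpu;		/* task_will_fit() equivalent */
	bool upmigrate_discouraged;	/* niced beyond the threshold or flagged */
};

/* First matching rule wins, mirroring the precedence of migration_needed(). */
static enum migration_reason classify_migration(const struct task_view *t,
						const struct cpu_view *c)
{
	if (t->sched_boosted && !c->max_capacity)
		return UP_MIGRATION;		/* boost wants the big cluster */
	if (c->high_irqload)
		return IRQLOAD_MIGRATION;	/* move off an IRQ-loaded CPU */
	if (t->upmigrate_discouraged && c->max_power_cost)
		return DOWN_MIGRATION;		/* don't hold the costliest CPU */
	if (!t->fits_current_cpu)
		return UP_MIGRATION;		/* demand outgrew this CPU */
	return NO_MIGRATION;
}

check_for_migration() in the hunk above then passes the non-zero reason to select_best_cpu() and, when do_migration() agrees (a different CPU, and a different cluster for up/down migrations), kicks active balance to push the running task there.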
+ +#endif /* CONFIG_SCHED_HMP */ + #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 #error "load tracking assumes 2^10 as unit" #endif @@ -2836,6 +3941,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } + if (running) sa->util_sum += scaled_delta * scale_cpu; @@ -3434,6 +4540,12 @@ static inline int idle_balance(struct rq *rq) return 0; } +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + #endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4060,6 +5172,35 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +#ifdef CONFIG_SCHED_HMP +/* + * Check if task is part of a hierarchy where some cfs_rq does not have any + * runtime left. + * + * We can't rely on throttled_hierarchy() to do this test, as + * cfs_rq->throttle_count will not be updated yet when this function is called + * from scheduler_tick() + */ +static int task_will_be_throttled(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq; + + if (!cfs_bandwidth_used()) + return 0; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + if (!cfs_rq->runtime_enabled) + continue; + if (cfs_rq->runtime_remaining <= 0) + return 1; + } + + return 0; +} +#endif + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -4139,13 +5280,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq); + } cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -4170,6 +5314,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); + + /* Log effect on hmp stats after throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4179,6 +5329,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se; int enqueue = 1; long task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4206,17 +5357,26 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq); if (cfs_rq_throttled(cfs_rq)) break; } - if (!se) + if (!se) { add_nr_running(rq, task_delta); + inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq); + } /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); + + /* Log effect on hmp stats after un-throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, @@ -4602,6 +5762,7 @@ static void init_cfs_rq_runtime(struct cfs_rq 
*cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + init_cfs_rq_hmp_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -4717,7 +5878,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4733,8 +5894,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) /* * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. + * current task is from our class. */ static void hrtick_update(struct rq *rq) { @@ -4743,8 +5903,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -4802,7 +5961,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; - walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; } @@ -4810,7 +5969,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; - walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4819,8 +5978,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + inc_rq_hmp_stats(rq, p, 1); + } #ifdef CONFIG_SMP @@ -4843,8 +6004,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_enqueue_task(p, cpu_of(rq)); - if (!se) { - walt_inc_cumulative_runnable_avg(rq, p); + if (energy_aware() && !se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -4882,7 +6042,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; - walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4902,7 +6062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; - walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4911,8 +6071,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + dec_rq_hmp_stats(rq, p, 1); + } #ifdef CONFIG_SMP @@ -4925,8 +6087,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_dequeue_task(p, cpu_of(rq)); - if (!se) - walt_dec_cumulative_runnable_avg(rq, p); #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5339,11 +6499,6 @@ unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } -static inline bool energy_aware(void) -{ - return sched_feat(ENERGY_AWARE); -} - struct energy_env { struct sched_group *sg_top; struct sched_group *sg_cap; @@ 
-5943,12 +7098,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static inline unsigned long task_util(struct task_struct *p) { -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_task_util) { - unsigned long demand = p->ravg.demand; - return (demand << 10) / walt_ravg_window; - } -#endif return p->se.avg.util_avg; } @@ -6333,6 +7482,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } + if (!(current->flags & PF_WAKE_UP_IDLE) && + !(p->flags & PF_WAKE_UP_IDLE)) + return target; + /* * Otherwise, iterate the domains and find an elegible idle cpu. */ @@ -6857,6 +8010,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; +#ifdef CONFIG_SCHED_HMP + return select_best_cpu(p, prev_cpu, 0, sync); +#endif + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p, sibling_count_hint) && @@ -7443,6 +8600,10 @@ enum group_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 +#define LBF_IGNORE_BIG_TASKS 0x100 +#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 +#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 struct lb_env { struct sched_domain *sd; @@ -7460,6 +8621,8 @@ struct lb_env { unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; + unsigned int busiest_grp_capacity; + unsigned int busiest_nr_running; unsigned int flags; @@ -7470,6 +8633,7 @@ struct lb_env { enum fbq_type fbq_type; enum group_type busiest_group_type; struct list_head tasks; + enum sched_boost_policy boost_policy; }; /* @@ -7567,6 +8731,7 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; + int twf, group_cpus; lockdep_assert_held(&env->src_rq->lock); @@ -7613,6 +8778,39 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) { + if (nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + if (env->boost_policy == SCHED_BOOST_ON_BIG && + !task_sched_boost(p)) + return 0; + } + + twf = task_will_fit(p, env->dst_cpu); + + /* + * Attempt to not pull tasks that don't fit. We may get lucky and find + * one that actually fits. + */ + if (env->flags & LBF_IGNORE_BIG_TASKS && !twf) + return 0; + + if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS && + !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p)) + return 0; + + /* + * Group imbalance can sometimes cause work to be pulled across groups + * even though the group could have managed the imbalance on its own. + * Prevent inter-cluster migrations for big tasks when the number of + * tasks is lower than the capacity of the group. + */ + group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity, + SCHED_CAPACITY_SCALE); + if (!twf && env->busiest_nr_running <= group_cpus) + return 0; + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; @@ -7620,15 +8818,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) IDLE or NEWLY_IDLE balance. 
+ * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. */ tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); - if (tsk_cache_hot <= 0 || + if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); @@ -7648,10 +8847,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + if (task_in_related_thread_group(p)) + env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; double_unlock_balance(env->src_rq, env->dst_rq); } @@ -7680,6 +8881,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); + return p; } return NULL; @@ -7699,12 +8901,20 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; unsigned long load; int detached = 0; + int orig_loop = env->loop; lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; + if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) + env->flags |= LBF_IGNORE_BIG_TASKS; + +redo: while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -7774,6 +8984,15 @@ next: list_move_tail(&p->se.group_node, tasks); } + if (env->flags & (LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) { + tasks = &env->src_rq->cfs_tasks; + env->flags &= ~(LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS); + env->loop = orig_loop; + goto redo; + } + /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather @@ -7792,8 +9011,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7940,6 +9159,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ +#ifdef CONFIG_SCHED_HMP + unsigned long sum_nr_big_tasks; + u64 group_cpu_load; /* Scaled load of all CPUs of the group */ +#endif unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; @@ -7983,10 +9206,64 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .avg_load = 0UL, .sum_nr_running = 0, .group_type = group_other, +#ifdef CONFIG_SCHED_HMP + .sum_nr_big_tasks = 0UL, + .group_cpu_load = 0ULL, +#endif }, }; } +#ifdef CONFIG_SCHED_HMP + +static int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + int local_cpu, busiest_cpu; + int local_capacity, busiest_capacity; + int local_pwr_cost, busiest_pwr_cost; + int nr_cpus; + int boost = sched_boost(); + + if (!sysctl_sched_restrict_cluster_spill || + boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST) + return 0; + + local_cpu = group_first_cpu(sds->local); + busiest_cpu = group_first_cpu(sds->busiest); + + local_capacity = 
cpu_max_possible_capacity(local_cpu); + busiest_capacity = cpu_max_possible_capacity(busiest_cpu); + + local_pwr_cost = cpu_max_power_cost(local_cpu); + busiest_pwr_cost = cpu_max_power_cost(busiest_cpu); + + if (local_pwr_cost <= busiest_pwr_cost) + return 0; + + if (local_capacity > busiest_capacity && + sds->busiest_stat.sum_nr_big_tasks) + return 0; + + nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest)); + if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) && + (sds->busiest_stat.sum_nr_running < + nr_cpus * sysctl_sched_spill_nr_run)) + return 1; + + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -8130,6 +9407,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -8161,9 +9440,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) do { struct sched_group_capacity *sgc = group->sgc; - capacity += sgc->capacity; - max_capacity = max(sgc->max_capacity, max_capacity); - min_capacity = min(sgc->min_capacity, min_capacity); + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); + } group = group->next; } while (group != child->groups); } @@ -8279,7 +9563,7 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline enum group_type group_classify(struct sched_group *group, - struct sg_lb_stats *sgs) + struct sg_lb_stats *sgs, struct lb_env *env) { if (sgs->group_no_capacity) return group_overloaded; @@ -8348,6 +9632,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); + trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, 0), + cpu_temp(i)); + + if (cpu_isolated(i)) + continue; + /* if we are entering idle and there are CPUs with * their tick stopped, do an update for them */ @@ -8368,6 +9660,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (nr_running > 1) *overload = true; +#ifdef CONFIG_SCHED_HMP + sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks; + sgs->group_cpu_load += cpu_load(i); +#endif + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -8379,25 +9676,62 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) { + if (energy_aware() && cpu_overutilized(i)) { *overutilized = true; if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i); } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + 
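+ /*
+ * No meaningful capacity or load can be derived for an all-isolated
+ * group, so skip group_is_overloaded()/group_classify() here and
+ * report the lowest classification instead.
+ */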
sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; + + sgs->group_weight = group->group_weight; + + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; +} - sgs->group_weight = group->group_weight; +#ifdef CONFIG_SCHED_HMP +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + if (env->idle != CPU_NOT_IDLE && + cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { + if (sgs->sum_nr_big_tasks > + sds->busiest_stat.sum_nr_big_tasks) { + env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; + return true; + } + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + return false; } +#else +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + return false; +} +#endif /** * update_sd_pick_busiest - return 1 on busiest group @@ -8419,35 +9753,40 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs)) + return true; + if (sgs->group_type > busiest->group_type) return true; if (sgs->group_type < busiest->group_type) return false; - /* - * Candidate sg doesn't face any serious load-balance problems - * so don't pick it if the local sg is already filled up. - */ - if (sgs->group_type == group_other && - !group_has_capacity(env, &sds->local_stat)) - return false; + if (energy_aware()) { + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; - if (sgs->avg_load <= busiest->avg_load) - return false; + if (sgs->avg_load <= busiest->avg_load) + return false; - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. - */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) - return false; + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + } asym_packing: /* This is the busiest node in its class. 
*/ @@ -8555,14 +9894,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); + sgs->group_type = group_classify(sg, sgs, env); } /* * Ignore task groups with misfit tasks if local group has no * capacity or if per-cpu capacity isn't higher. */ - if (sgs->group_type == group_misfit_task && + if (energy_aware() && + sgs->group_type == group_misfit_task && (!group_has_capacity(env, &sds->local_stat) || !group_smaller_cpu_capacity(sg, sds->local))) sgs->group_type = group_other; @@ -8570,6 +9910,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; + env->busiest_nr_running = sgs->sum_nr_running; + env->busiest_grp_capacity = sgs->group_capacity; } next_group: @@ -8591,12 +9933,12 @@ next_group: env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) { + if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) { env->dst_rq->rd->overutilized = overutilized; trace_sched_overutilized(overutilized); } } else { - if (!env->dst_rq->rd->overutilized && overutilized) { + if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) { env->dst_rq->rd->overutilized = true; trace_sched_overutilized(true); } @@ -8748,20 +10090,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { - /* Misfitting tasks should be migrated in any case */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = busiest->group_misfit_task; - return; - } + if (energy_aware()) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } - /* - * Busiest group is overloaded, local is not, use the spare - * cycles to maximize throughput - */ - if (busiest->group_type == group_overloaded && - local->group_type <= group_misfit_task) { - env->imbalance = busiest->load_per_task; - return; + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } } env->imbalance = 0; @@ -8798,7 +10142,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; /* Boost imbalance to allow misfit task to be balanced. 
*/ - if (busiest->group_type == group_misfit_task) + if (energy_aware() && busiest->group_type == group_misfit_task) env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task); @@ -8859,6 +10203,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) + goto force_balance; + + if (bail_inter_cluster_balance(env, &sds)) + goto out_balanced; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; @@ -8879,7 +10229,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* Misfitting tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) { + if (energy_aware() && busiest->group_type == group_misfit_task) { goto force_balance; } @@ -8930,6 +10280,60 @@ out_balanced: return NULL; } +#ifdef CONFIG_SCHED_HMP +static struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + struct rq *busiest = NULL, *busiest_big = NULL; + u64 max_runnable_avg = 0, max_runnable_avg_big = 0; + int max_nr_big = 0, nr_big; + bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE); + int i; + cpumask_t cpus; + + cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask); + + for_each_cpu(i, &cpus) { + struct rq *rq = cpu_rq(i); + u64 cumulative_runnable_avg = + rq->hmp_stats.cumulative_runnable_avg; + + if (!cpumask_test_cpu(i, env->cpus)) + continue; + + + if (find_big) { + nr_big = nr_big_tasks(rq); + if (nr_big > max_nr_big || + (nr_big > 0 && nr_big == max_nr_big && + cumulative_runnable_avg > max_runnable_avg_big)) { + max_runnable_avg_big = cumulative_runnable_avg; + busiest_big = rq; + max_nr_big = nr_big; + continue; + } + } + + if (cumulative_runnable_avg > max_runnable_avg) { + max_runnable_avg = cumulative_runnable_avg; + busiest = rq; + } + } + + if (busiest_big) + return busiest_big; + + env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE; + return busiest; +} +#else +static inline struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + return NULL; +} +#endif + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -8940,6 +10344,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; +#ifdef CONFIG_SCHED_HMP + return find_busiest_queue_hmp(env, group); +#endif + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -9008,15 +10416,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but * so long as it is large enough. */ -#define MAX_PINNED_INTERVAL 512 +#define MAX_PINNED_INTERVAL 16 /* Working cpumask for load_balance and load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +#define NEED_ACTIVE_BALANCE_THRESHOLD 10 + static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) + return 1; + if (env->idle == CPU_NEWLY_IDLE) { /* @@ -9041,7 +10454,8 @@ static int need_active_balance(struct lb_env *env) return 1; } - if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + if (energy_aware() && + (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && env->src_rq->cfs.h_nr_running == 1 && cpu_overutilized(env->src_cpu) && @@ -9049,10 +10463,18 @@ static int need_active_balance(struct lb_env *env) return 1; } - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); + return unlikely(sd->nr_balance_failed > + sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD); } -static int active_load_balance_cpu_stop(void *data); +static int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} static int should_we_balance(struct lb_env *env) { @@ -9071,7 +10493,8 @@ static int should_we_balance(struct lb_env *env) sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) || + cpu_isolated(cpu)) continue; balance_cpu = cpu; @@ -9079,7 +10502,7 @@ static int should_we_balance(struct lb_env *env) } if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); + balance_cpu = group_balance_cpu_not_isolated(sg); /* * First idle cpu or the first cpu(busiest) in this sched group @@ -9096,23 +10519,29 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { - int ld_moved, cur_ld_moved, active_balance = 0; + int ld_moved = 0, cur_ld_moved, active_balance = 0; struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; - struct sched_group *group; - struct rq *busiest; + struct sched_group *group = NULL; + struct rq *busiest = NULL; unsigned long flags; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .boost_policy = sched_boost_policy(), }; /* @@ -9160,12 +10589,24 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); update_rq_clock(busiest); + /* The world might have changed. 
Validate assumptions */ + if (busiest->nr_running <= 1) { + raw_spin_unlock_irqrestore(&busiest->lock, flags); + env.flags &= ~LBF_ALL_PINNED; + goto no_move; + } + + /* + * Set loop_max when rq's lock is taken to prevent a race. + */ + env.loop_max = min(sysctl_sched_nr_migrate, + busiest->nr_running); + /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations @@ -9253,17 +10694,22 @@ more_balance: } } +no_move: if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) + schedstat_inc(sd, lb_failed[idle]); + /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. */ - if (idle != CPU_NEWLY_IDLE) - if (env.src_grp_nr_running > 1) + if (idle != CPU_NEWLY_IDLE && + !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) { + if (env.src_grp_nr_running > 1) sd->nr_balance_failed++; + } if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); @@ -9285,7 +10731,8 @@ more_balance: * ->active_balance_work. Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -9296,17 +10743,31 @@ more_balance: stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); + *continue_balancing = 0; } /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; + sd->nr_balance_failed = + sd->cache_nice_tries + + NEED_ACTIVE_BALANCE_THRESHOLD - 1; } - } else + } else { sd->nr_balance_failed = 0; + /* Assumes one 'busiest' cpu that we pulled tasks from */ + if (!same_freq_domain(this_cpu, cpu_of(busiest))) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + + check_for_freq_change(this_rq, false, check_groups); + check_for_freq_change(busiest, false, check_groups); + } else { + check_for_freq_change(this_rq, true, false); + } + } if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; @@ -9364,6 +10825,11 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; out: + trace_sched_load_balance(this_cpu, idle, *continue_balancing, + group ? group->cpumask[0] : 0, + busiest ? busiest->nr_running : 0, + env.imbalance, env.flags, ld_moved, + sd->balance_interval); return ld_moved; } @@ -9406,6 +10872,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -9460,9 +10929,12 @@ static int idle_balance(struct rq *this_rq) /* * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. + * now runnable tasks on the balance rq or if + * continue_balancing has been unset (only possible + * due to active migration). 
*/ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + !continue_balancing) break; } rcu_read_unlock(); @@ -9514,13 +10986,19 @@ static int active_load_balance_cpu_stop(void *data) struct task_struct *push_task = NULL; int push_task_detached = 0; struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .flags = 0, + .loop = 0, + .boost_policy = sched_boost_policy(), }; + bool moved = false; raw_spin_lock_irq(&busiest_rq->lock); @@ -9541,12 +11019,15 @@ static int active_load_balance_cpu_stop(void *data) BUG_ON(busiest_rq == target_rq); push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; if (push_task) { if (task_on_rq_queued(push_task) && + push_task->state == TASK_RUNNING && task_cpu(push_task) == busiest_cpu && cpu_online(target_cpu)) { detach_task(push_task, &env); push_task_detached = 1; + moved = true; } goto out_unlock; } @@ -9565,14 +11046,18 @@ static int active_load_balance_cpu_stop(void *data) update_rq_clock(busiest_rq); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + moved = true; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; if (push_task) busiest_rq->push_task = NULL; @@ -9583,6 +11068,7 @@ out_unlock: if (push_task_detached) attach_one_task(target_rq, push_task); put_task_struct(push_task); + clear_reserved(target_cpu); } if (p) @@ -9590,6 +11076,15 @@ out_unlock: local_irq_enable(); + if (moved && !same_freq_domain(busiest_cpu, target_cpu)) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + check_for_freq_change(busiest_rq, false, check_groups); + check_for_freq_change(target_rq, false, check_groups); + } else if (moved) { + check_for_freq_change(target_rq, true, false); + } + return 0; } @@ -9605,9 +11100,49 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static inline int find_new_ilb(void) + +#ifdef CONFIG_SCHED_HMP +static inline int find_new_hmp_ilb(int type) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int call_cpu = raw_smp_processor_id(); + struct sched_domain *sd; + int ilb; + + rcu_read_lock(); + + /* Pick an idle cpu "closest" to call_cpu */ + for_each_domain(call_cpu, sd) { + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + sched_domain_span(sd)) { + if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT || + cpu_max_power_cost(ilb) <= + cpu_max_power_cost(call_cpu))) { + rcu_read_unlock(); + reset_balance_interval(ilb); + return ilb; + } + } + } + + rcu_read_unlock(); + return nr_cpu_ids; +} +#else /* CONFIG_SCHED_HMP */ +static inline int find_new_hmp_ilb(int type) +{ + return 0; +} +#endif /* CONFIG_SCHED_HMP */ + +static inline int find_new_ilb(int type) +{ + int ilb; + +#ifdef CONFIG_SCHED_HMP + return find_new_hmp_ilb(type); +#endif + + ilb = cpumask_first(nohz.idle_cpus_mask); if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; @@ -9620,13 +11155,13 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). 
*/ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(int type) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(); + ilb_cpu = find_new_ilb(type); if (ilb_cpu >= nr_cpu_ids) return; @@ -9643,16 +11178,21 @@ static void nohz_balancer_kick(void) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -9709,7 +11249,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9738,7 +11278,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9863,12 +11409,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t cpus; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -9911,6 +11460,79 @@ end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } +#ifdef CONFIG_SCHED_HMP +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + struct sched_domain *sd; + int i; + + if (rq->nr_running < 2) + return 0; + + if (!sysctl_sched_restrict_cluster_spill || + sched_boost_policy() == SCHED_BOOST_ON_ALL) + return 1; + + if (cpu_max_power_cost(cpu) == max_power_cost) + return 1; + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(rq->sd); + if (!sd) { + rcu_read_unlock(); + return 0; + } + + for_each_cpu(i, sched_domain_span(sd)) { + if (cpu_load(i) < sched_spill_load && + cpu_rq(i)->nr_running < + sysctl_sched_spill_nr_run) { + /* Change the kick type to limit to CPUs that + * are of equal or lower capacity. + */ + *type = NOHZ_KICK_RESTRICT; + break; + } + } + rcu_read_unlock(); + return 1; +} +#else +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + return 0; +} +#endif + +static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) +{ + unsigned long now = jiffies; + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. 
+ */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; + +#ifdef CONFIG_SCHED_HMP + return _nohz_kick_needed_hmp(rq, cpu, type); +#endif + + if (time_before(now, nohz.next_balance)) + return 0; + + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) + return true; + + /* Do idle load balance if there have misfit task */ + if (energy_aware()) + return rq->misfit_task; + + return (rq->nr_running >= 2); +} + /* * Current heuristic for kicking the idle load balancer in the presence * of an idle cpu in the system. @@ -9922,12 +11544,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline bool nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq, int *type) { - unsigned long now = jiffies; +#ifndef CONFIG_SCHED_HMP struct sched_domain *sd; struct sched_group_capacity *sgc; - int nr_busy, cpu = rq->cpu; + int nr_busy; +#endif + int cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -9940,24 +11564,10 @@ static inline bool nohz_kick_needed(struct rq *rq) set_cpu_sd_state_busy(); nohz_balance_exit_idle(cpu); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return false; - - if (time_before(now, nohz.next_balance)) - return false; - - if (rq->nr_running >= 2 && - (!energy_aware() || cpu_overutilized(cpu))) + if (_nohz_kick_needed(rq, cpu, type)) return true; - /* Do idle load balance if there have misfit task */ - if (energy_aware()) - return rq->misfit_task; - +#ifndef CONFIG_SCHED_HMP rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { @@ -9989,6 +11599,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unlock: rcu_read_unlock(); +#endif return kick; } #else @@ -10022,15 +11633,19 @@ static void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + int type = NOHZ_KICK_ANY; + + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. 
+ */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); + if (nohz_kick_needed(rq, &type)) + nohz_balancer_kick(type); #endif } @@ -10049,47 +11664,6 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } -static inline int -kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) -{ - int rc = 0; - - /* Invoke active balance to force migrate currently running task */ - raw_spin_lock(&rq->lock); - if (!rq->active_balance) { - rq->active_balance = 1; - rq->push_cpu = new_cpu; - get_task_struct(p); - rq->push_task = p; - rc = 1; - } - raw_spin_unlock(&rq->lock); - - return rc; -} - -void check_for_migration(struct rq *rq, struct task_struct *p) -{ - int new_cpu; - int active_balance; - int cpu = task_cpu(p); - - if (energy_aware() && rq->misfit_task) { - if (rq->curr->state != TASK_RUNNING || - rq->curr->nr_cpus_allowed == 1) - return; - - new_cpu = select_energy_cpu_brute(p, cpu, 0); - if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { - active_balance = kick_active_balance(rq, p, new_cpu); - if (active_balance) - stop_one_cpu_nowait(cpu, - active_load_balance_cpu_stop, - rq, &rq->active_balance_work); - } - } -} - #endif /* CONFIG_SMP */ /* @@ -10109,7 +11683,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + if (energy_aware() && + !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { rq->rd->overutilized = true; trace_sched_overutilized(true); } @@ -10609,6 +12184,11 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_fair, + .dec_hmp_sched_stats = dec_hmp_sched_stats_fair, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 55e461055332..c30c48fde7e6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true) * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ -SCHED_FEAT(TTWU_QUEUE, true) +SCHED_FEAT(TTWU_QUEUE, false) #ifdef HAVE_RT_PUSH_IPI /* diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c new file mode 100644 index 000000000000..598656b42203 --- /dev/null +++ b/kernel/sched/hmp.c @@ -0,0 +1,4416 @@ +/* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Implementation credits: Srivatsa Vaddagiri, Steve Muckle + * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti + * and Vikram Mulukutla + */ + +#include <linux/cpufreq.h> +#include <linux/list_sort.h> +#include <linux/syscore_ops.h> + +#include "sched.h" + +#include <trace/events/sched.h> + +#define CSTATE_LATENCY_GRANULARITY_SHIFT (6) + +const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; + +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"}; + +static ktime_t ktime_last; +static bool sched_ktime_suspended; + +static bool use_cycle_counter; +static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +inline void clear_ed_task(struct task_struct *p, struct rq *rq) +{ + if (p == rq->ed_task) + rq->ed_task = NULL; +} + +inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) +{ + p->last_switch_out_ts = wallclock; +} + +/* + * Note C-state for (idle) cpus. + * + * @cstate = cstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cpu + * @wakeup_latency = latency to wakeup from cstate + * + */ +void +sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) +{ + struct rq *rq = cpu_rq(cpu); + + rq->cstate = cstate; /* C1, C2 etc */ + rq->wakeup_energy = wakeup_energy; + /* disregard small latency delta (64 us). */ + rq->wakeup_latency = ((wakeup_latency >> + CSTATE_LATENCY_GRANULARITY_SHIFT) << + CSTATE_LATENCY_GRANULARITY_SHIFT); +} + +/* + * Note D-state for (idle) cluster. + * + * @dstate = dstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cluster + * @wakeup_latency = latency to wakeup from cluster + * + */ +void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, + int wakeup_energy, int wakeup_latency) +{ + struct sched_cluster *cluster = + cpu_rq(cpumask_first(cluster_cpus))->cluster; + cluster->dstate = dstate; + cluster->dstate_wakeup_energy = wakeup_energy; + cluster->dstate_wakeup_latency = wakeup_latency; +} + +u32 __weak get_freq_max_load(int cpu, u32 freq) +{ + /* 100% by default */ + return 100; +} + +struct freq_max_load_entry { + /* The maximum load which has accounted governor's headroom. 
*/ + u64 hdemand; +}; + +struct freq_max_load { + struct rcu_head rcu; + int length; + struct freq_max_load_entry freqs[0]; +}; + +static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); +static DEFINE_SPINLOCK(freq_max_load_lock); + +struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) +{ + return NULL; +} + +int sched_update_freq_max_load(const cpumask_t *cpumask) +{ + int i, cpu, ret; + unsigned int freq; + struct cpu_pstate_pwr *costs; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct freq_max_load *max_load, *old_max_load; + struct freq_max_load_entry *entry; + u64 max_demand_capacity, max_demand; + unsigned long flags; + u32 hfreq; + int hpct; + + if (!per_cpu_info) + return 0; + + spin_lock_irqsave(&freq_max_load_lock, flags); + max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); + for_each_cpu(cpu, cpumask) { + if (!per_cpu_info[cpu].ptable) { + ret = -EINVAL; + goto fail; + } + + old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + + /* + * allocate len + 1 and leave the last power cost as 0 for + * power_cost() can stop iterating index when + * per_cpu_info[cpu].len > len of max_load due to race between + * cpu power stats update and get_cpu_pwr_stats(). + */ + max_load = kzalloc(sizeof(struct freq_max_load) + + sizeof(struct freq_max_load_entry) * + (per_cpu_info[cpu].len + 1), GFP_ATOMIC); + if (unlikely(!max_load)) { + ret = -ENOMEM; + goto fail; + } + + max_load->length = per_cpu_info[cpu].len; + + max_demand = max_demand_capacity * + cpu_max_possible_capacity(cpu); + + i = 0; + costs = per_cpu_info[cpu].ptable; + while (costs[i].freq) { + entry = &max_load->freqs[i]; + freq = costs[i].freq; + hpct = get_freq_max_load(cpu, freq); + if (hpct <= 0 || hpct > 100) + hpct = 100; + hfreq = div64_u64((u64)freq * hpct, 100); + entry->hdemand = + div64_u64(max_demand * hfreq, + cpu_max_possible_freq(cpu)); + i++; + } + + rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); + if (old_max_load) + kfree_rcu(old_max_load, rcu); + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return 0; + +fail: + for_each_cpu(cpu, cpumask) { + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (max_load) { + rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); + kfree_rcu(max_load, rcu); + } + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return ret; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_online_cpu(i) { + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to 
cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. 
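+ *
+ * Worked example (illustrative numbers only): for a cluster whose
+ * efficiency is half of max_possible_efficiency and whose max_freq is
+ * half of max_possible_freq, both scaling factors below evaluate to
+ * 2048, so load_scale_factor becomes
+ * (((1024 * 2048) >> 10) * 2048) >> 10 = 4096, i.e. task load is
+ * scaled up by 4x when judged against this cluster.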
+ */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +struct list_head cluster_head; +static DEFINE_MUTEX(cluster_lock); +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +unsigned int max_power_cost = 1; + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .max_mitigated_freq = UINT_MAX, + .min_freq = 1, + .max_possible_freq = 1, + .dstate = 0, + .dstate_wakeup_energy = 0, + .dstate_wakeup_latency = 0, + .exec_scale_factor = 1024, + .notifier_sent = 0, + .wake_up_idle = 0, +}; + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + + pre_big_task_count_change(cpu_possible_mask); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + sched_update_freq_max_load(cpu_possible_mask); + post_big_task_count_change(cpu_possible_mask); +} + +static void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +static void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +static int +compare_clusters(void *priv, struct list_head *a, struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. 
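+ *
+ * list_sort() treats a positive return value as "sort @a after @b", so
+ * the resulting list is ordered by ascending max_power_cost, with ties
+ * broken in favor of the higher max_possible_capacity cluster.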
+ */ + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +static void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + unsigned int tmp_max = 1; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + + if (cluster->max_power_cost > tmp_max) + tmp_max = cluster->max_power_cost; + } + max_power_cost = tmp_max; + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); +} + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + __WARN_printf("Cluster allocation failed. \ + Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->max_mitigated_freq = UINT_MAX; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->dstate = 0; + cluster->dstate_wakeup_energy = 0; + cluster->dstate_wakeup_latency = 0; + cluster->freq_init_done = false; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + cluster->notifier_sent = 0; + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); +} + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); +} + +int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + mutex_lock(&cluster_lock); + if (!cb->get_cpu_cycle_counter) { + mutex_unlock(&cluster_lock); + return -EINVAL; + } + + cpu_cycle_counter_cb = *cb; + use_cycle_counter = true; + mutex_unlock(&cluster_lock); + + return 0; +} + +/* Clear any HMP scheduler related requests pending from or on cpu */ +void clear_hmp_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + clear_boost_kick(cpu); + clear_reserved(cpu); + if (rq->push_task) { + struct task_struct *push_task = NULL; + + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->push_task) { + clear_reserved(rq->push_cpu); + push_task = rq->push_task; + rq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + if (push_task) + put_task_struct(push_task); + } +} + +int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) +{ + struct rq *rq = cpu_rq(cpu); + + rq->static_cpu_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cpu_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->static_cpu_pwr_cost; +} + +int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->static_cluster_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cluster_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; +} + +int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->wake_up_idle = !!wake_idle; + return 0; +} + +unsigned int sched_get_cluster_wake_idle(int cpu) +{ + return cpu_rq(cpu)->cluster->wake_up_idle; +} + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +/* + * Tasks that are runnable continuously for a period greather than + * EARLY_DETECTION_DURATION can be flagged early as potential + * high load tasks. + */ +#define EARLY_DETECTION_DURATION 9500000 + +static __read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +static __read_mostly unsigned int sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; + +#define SCHED_ACCOUNT_WAIT_TIME 1 + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* + * Enable colocation and frequency aggregation for all threads in a process. + * The children inherits the group id from the parent. + */ +unsigned int __read_mostly sysctl_sched_enable_thread_grouping; + + +#define SCHED_NEW_TASK_WINDOWS 5 + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 + +/* + * This governs what load needs to be used when reporting CPU busy time + * to the cpufreq governor. 
+ */ +__read_mostly unsigned int sysctl_sched_freq_reporting_policy; + +/* + * For increase, send notification if + * freq_required - cur_freq > sysctl_sched_freq_inc_notify + */ +__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ + +/* + * For decrease, send notification if + * cur_freq - freq_required > sysctl_sched_freq_dec_notify + */ +__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ + +static __read_mostly unsigned int sched_io_is_busy; + +__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; + +/* Maximum allowed threshold before freq aggregation must be enabled */ +#define MAX_FREQ_AGGR_THRESH 1000 + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; +static LIST_HEAD(active_related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &active_related_thread_groups, list) + +/* + * Task load is categorized into buckets for the purpose of top task tracking. + * The entire range of load from 0 to sched_ravg_window needs to be covered + * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket + * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value + * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute + * sched_load_granule. + */ +__read_mostly unsigned int sched_load_granule = + MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES; + +/* Size of bitmaps maintained to track top tasks */ +static const unsigned int top_tasks_bitmap_size = + BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long); + +/* + * Demand aggregation for frequency purpose: + * + * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads + * for frequency determination purpose. This aggregation is done per-cluster. + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just rq->prev_runnable_sum. + * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. 
+ * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +static __read_mostly unsigned int sched_freq_aggregate = 1; +__read_mostly unsigned int sysctl_sched_freq_aggregate = 1; + +unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; +static unsigned int __read_mostly sched_freq_aggregate_threshold; + +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +/* A cpu can no longer accommodate more tasks if: + * + * rq->nr_running > sysctl_sched_spill_nr_run || + * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load + */ +unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; + +/* + * Place sync wakee tasks those have less than configured demand to the waker's + * cluster. + */ +unsigned int __read_mostly sched_small_wakee_task_load; +unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; + +unsigned int __read_mostly sched_big_waker_task_load; +unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; + +/* + * CPUs with load greater than the sched_spill_load_threshold are not + * eligible for task placement. When all CPUs in a cluster achieve a + * load higher than this level, tasks becomes eligible for inter + * cluster migration. + */ +unsigned int __read_mostly sched_spill_load; +unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; + +/* + * Prefer the waker CPU for sync wakee task, if the CPU has only 1 runnable + * task. This eliminates the LPM exit latency associated with the idle + * CPUs in the waker cluster. + */ +unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker; + +/* + * Tasks whose bandwidth consumption on a cpu is more than + * sched_upmigrate are considered "big" tasks. Big tasks will be + * considered for "up" migration, i.e migrating to a cpu with better + * capacity. + */ +unsigned int __read_mostly sched_upmigrate; +unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; + +/* + * Big tasks, once migrated, will need to drop their bandwidth + * consumption to less than sched_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_downmigrate; +unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; + +/* + * Task groups whose aggregate demand on a cpu is more than + * sched_group_upmigrate need to be up-migrated if possible. 
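+ *
+ * With the default 10ms window and no max_freq mitigation
+ * (up_down_migrate_scale_factor == 1024), the default of 100% below
+ * corresponds to one full window, i.e. 10ms of aggregate demand.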
+ */ +unsigned int __read_mostly sched_group_upmigrate; +unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100; + +/* + * Task groups, once up-migrated, will need to drop their aggregate + * demand to less than sched_group_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_group_downmigrate; +unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95; + +/* + * The load scale factor of a CPU gets boosted when its max frequency + * is restricted due to which the tasks are migrating to higher capacity + * CPUs early. The sched_upmigrate threshold is auto-upgraded by + * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. + */ +unsigned int up_down_migrate_scale_factor = 1024; + +/* + * Scheduler selects and places task to its previous CPU if sleep time is + * less than sysctl_sched_select_prev_cpu_us. + */ +unsigned int __read_mostly +sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; + +unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; + +unsigned int __read_mostly +sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; + +unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; + +/* + * Scheduler tries to avoid waking up idle CPUs for tasks running + * in short bursts. If the task average burst is less than + * sysctl_sched_short_burst nanoseconds and it sleeps on an average + * for more than sysctl_sched_short_sleep nanoseconds, then the + * task is eligible for packing. + */ +unsigned int __read_mostly sysctl_sched_short_burst; +unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC; + +static void _update_up_down_migrate(unsigned int *up_migrate, + unsigned int *down_migrate, bool is_group) +{ + unsigned int delta; + + if (up_down_migrate_scale_factor == 1024) + return; + + delta = *up_migrate - *down_migrate; + + *up_migrate /= NSEC_PER_USEC; + *up_migrate *= up_down_migrate_scale_factor; + *up_migrate >>= 10; + *up_migrate *= NSEC_PER_USEC; + + if (!is_group) + *up_migrate = min(*up_migrate, sched_ravg_window); + + *down_migrate /= NSEC_PER_USEC; + *down_migrate *= up_down_migrate_scale_factor; + *down_migrate >>= 10; + *down_migrate *= NSEC_PER_USEC; + + *down_migrate = min(*down_migrate, *up_migrate - delta); +} + +static void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate, false); + sched_upmigrate = up_migrate; + sched_downmigrate = down_migrate; + + up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); + down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate, true); + sched_group_upmigrate = up_migrate; + sched_group_downmigrate = down_migrate; +} + +void set_hmp_defaults(void) +{ + sched_spill_load = + pct_to_real(sysctl_sched_spill_load_pct); + + update_up_down_migrate(); + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * + NSEC_PER_USEC; + + sched_small_wakee_task_load = + div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_big_waker_task_load = + div64_u64((u64)sysctl_sched_big_waker_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); +} + 
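+/*
+ * Worked example for _update_up_down_migrate() above (illustrative
+ * numbers only, assuming the default 10ms window):
+ *
+ * up = pct_to_real(80) = 8ms, down = pct_to_real(60) = 6ms, delta = 2ms
+ *
+ * If a low capacity CPU's max_freq is capped to half of its
+ * max_possible_freq, up_down_migrate_scale_factor becomes 2048 and:
+ *
+ * up = min(8ms * 2048 >> 10, sched_ravg_window) = min(16ms, 10ms) = 10ms
+ * down = min(6ms * 2048 >> 10, up - delta) = min(12ms, 8ms) = 8ms
+ *
+ * so the original 2ms gap between the upmigrate and downmigrate
+ * thresholds is preserved even after the upmigrate threshold is clamped
+ * to the window size.
+ */
+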
+u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED + +int upmigrate_discouraged(struct task_struct *p) +{ + return task_group(p)->upmigrate_discouraged; +} + +#else + +static inline int upmigrate_discouraged(struct task_struct *p) +{ + return 0; +} + +#endif + +/* Is a task "big" on its current cpu */ +static inline int __is_big_task(struct task_struct *p, u64 scaled_load) +{ + int nice = task_nice(p); + + if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) + return 0; + + return scaled_load > sched_upmigrate; +} + +int is_big_task(struct task_struct *p) +{ + return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); +} + +u64 cpu_load(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); +} + +u64 cpu_load_sync(int cpu, int sync) +{ + return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); +} + +/* + * Task will fit on a cpu if it's bandwidth consumption on that cpu + * will be less than sched_upmigrate. A big task that was previously + * "up" migrated will be considered fitting on "little" cpu if its + * bandwidth consumption on "little" cpu will be less than + * sched_downmigrate. This will help avoid frequenty migrations for + * tasks with load close to the upmigrate threshold + */ +int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_policy boost_policy) +{ + int upmigrate = sched_upmigrate; + + if (cpu_capacity(cpu) == max_capacity) + return 1; + + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (boost_policy != SCHED_BOOST_ON_BIG) { + if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) + return 1; + + if (task_load < upmigrate) + return 1; + } else { + if (task_sched_boost(p) || task_load >= upmigrate) + return 0; + + return 1; + } + + return 0; +} + +int task_will_fit(struct task_struct *p, int cpu) +{ + u64 tload = scale_load_to_cpu(task_load(p), cpu); + + return task_load_will_fit(p, tload, cpu, sched_boost_policy()); +} + +static int +group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp, + u64 demand, bool group_boost) +{ + int cpu = cluster_first_cpu(cluster); + int prev_capacity = 0; + unsigned int threshold = sched_group_upmigrate; + u64 load; + + if (cluster->capacity == max_capacity) + return 1; + + if (group_boost) + return 0; + + if (!demand) + return 1; + + if (grp->preferred_cluster) + prev_capacity = grp->preferred_cluster->capacity; + + if (cluster->capacity < prev_capacity) + threshold = sched_group_downmigrate; + + load = scale_load_to_cpu(demand, cpu); + if (load < threshold) + return 1; + + return 0; +} + +/* + * Return the cost of running task p on CPU cpu. This function + * currently assumes that task p is the only task which will run on + * the CPU. 
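+ *
+ * freq_max_load->freqs[] is ordered by increasing frequency and each
+ * entry's hdemand records the highest (headroom-adjusted) demand that
+ * frequency is expected to serve. The binary search below therefore
+ * returns the power of the lowest frequency that can accommodate
+ * @demand, adding the CPU's static power cost when it is idle in a
+ * C-state (and the cluster's when the cluster is in a D-state).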
+ */ +unsigned int power_cost(int cpu, u64 demand) +{ + int first, mid, last; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct cpu_pstate_pwr *costs; + struct freq_max_load *max_load; + int total_static_pwr_cost = 0; + struct rq *rq = cpu_rq(cpu); + unsigned int pc; + + if (!per_cpu_info || !per_cpu_info[cpu].ptable) + /* + * When power aware scheduling is not in use, or CPU + * power data is not available, just use the CPU + * capacity as a rough stand-in for real CPU power + * numbers, assuming bigger CPUs are more power + * hungry. + */ + return cpu_max_possible_capacity(cpu); + + rcu_read_lock(); + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (!max_load) { + pc = cpu_max_possible_capacity(cpu); + goto unlock; + } + + costs = per_cpu_info[cpu].ptable; + + if (demand <= max_load->freqs[0].hdemand) { + pc = costs[0].power; + goto unlock; + } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { + pc = costs[max_load->length - 1].power; + goto unlock; + } + + first = 0; + last = max_load->length - 1; + mid = (last - first) >> 1; + while (1) { + if (demand <= max_load->freqs[mid].hdemand) + last = mid; + else + first = mid; + + if (last - first == 1) + break; + mid = first + ((last - first) >> 1); + } + + pc = costs[last].power; + +unlock: + rcu_read_unlock(); + + if (idle_cpu(cpu) && rq->cstate) { + total_static_pwr_cost += rq->static_cpu_pwr_cost; + if (rq->cluster->dstate) + total_static_pwr_cost += + rq->cluster->static_cluster_pwr_cost; + } + + return pc + total_static_pwr_cost; + +} + +void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks++; +} + +void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks--; + + BUG_ON(stats->nr_big_tasks < 0); +} + +void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) +{ + stats->nr_big_tasks = 0; + if (reset_cra) { + stats->cumulative_runnable_avg = 0; + stats->pred_demands_sum = 0; + } +} + +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp; + int rc = 1; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (grp) + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +struct sched_cluster *rq_cluster(struct rq *rq) +{ + return rq->cluster; +} + +/* + * reset_cpu_hmp_stats - reset HMP stats for a cpu + * nr_big_tasks + * cumulative_runnable_avg (iff reset_cra is true) + */ +void reset_cpu_hmp_stats(int cpu, int reset_cra) +{ + reset_cfs_rq_hmp_stats(cpu, reset_cra); + reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); +} + +void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta) +{ + u64 new_task_load; + u64 old_task_load; + + if (sched_disable_window_stats) + return; + + old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); + new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); + + if (__is_big_task(p, old_task_load) && 
!__is_big_task(p, new_task_load)) + stats->nr_big_tasks--; + else if (!__is_big_task(p, old_task_load) && + __is_big_task(p, new_task_load)) + stats->nr_big_tasks++; + + BUG_ON(stats->nr_big_tasks < 0); +} + +/* + * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. + */ +static void update_nr_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + /* Do not reset cumulative_runnable_avg */ + reset_cpu_hmp_stats(cpu, 0); + + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) + _inc_hmp_sched_stats_fair(rq, p, 0); +} + +/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ +void pre_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + local_irq_disable(); + + for_each_cpu(i, cpus) + raw_spin_lock(&cpu_rq(i)->lock); +} + +/* + * Reinitialize 'nr_big_tasks' counters on all affected cpus + */ +void post_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + /* Assumes local_irq_disable() keeps online cpumap stable */ + for_each_cpu(i, cpus) + update_nr_big_tasks(i); + + for_each_cpu(i, cpus) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_enable(); +} + +DEFINE_MUTEX(policy_mutex); + +unsigned int update_freq_aggregate_threshold(unsigned int threshold) +{ + unsigned int old_threshold; + + mutex_lock(&policy_mutex); + + old_threshold = sysctl_sched_freq_aggregate_threshold_pct; + + sysctl_sched_freq_aggregate_threshold_pct = threshold; + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); + + mutex_unlock(&policy_mutex); + + return old_threshold; +} + +static inline int invalid_value_freq_input(unsigned int *data) +{ + if (data == &sysctl_sched_freq_aggregate) + return !(*data == 0 || *data == 1); + + return 0; +} + +static inline int invalid_value(unsigned int *data) +{ + unsigned int val = *data; + + if (data == &sysctl_sched_ravg_hist_size) + return (val < 2 || val > RAVG_HIST_SIZE_MAX); + + if (data == &sysctl_sched_window_stats_policy) + return val >= WINDOW_STATS_INVALID_POLICY; + + return invalid_value_freq_input(data); +} + +/* + * Handle "atomic" update of sysctl_sched_window_stats_policy, + * sysctl_sched_ravg_hist_size variables. + */ +int sched_window_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write || (write && (old_val == *data))) + goto done; + + if (invalid_value(data)) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + reset_all_window_stats(0, 0); + +done: + mutex_unlock(&policy_mutex); + + return ret; +} + +/* + * Convert percentage value into absolute form. This will avoid div() operation + * in fast path, to convert task load in percentage scale. + */ +int sched_hmp_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int old_val; + unsigned int *data = (unsigned int *)table->data; + int update_task_count = 0; + + /* + * The policy mutex is acquired with cpu_hotplug.lock + * held from cpu_up()->cpufreq_governor_interactive()-> + * sched_set_window(). So enforce the same order here. 
+ */ + if (write && (data == &sysctl_sched_upmigrate_pct)) { + update_task_count = 1; + get_online_cpus(); + } + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (write && (old_val == *data)) + goto done; + + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct || + sysctl_sched_group_downmigrate_pct > + sysctl_sched_group_upmigrate_pct) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + /* + * Big task tunable change will need to re-classify tasks on + * runqueue as big and set their counters appropriately. + * sysctl interface affects secondary variables (*_pct), which is then + * "atomically" carried over to the primary variables. Atomic change + * includes taking runqueue lock of all online cpus and re-initiatizing + * their big counter values based on changed criteria. + */ + if (update_task_count) + pre_big_task_count_change(cpu_online_mask); + + set_hmp_defaults(); + + if (update_task_count) + post_big_task_count_change(cpu_online_mask); + +done: + mutex_unlock(&policy_mutex); + if (update_task_count) + put_online_cpus(); + return ret; +} + +inline int nr_big_tasks(struct rq *rq) +{ + return rq->hmp_stats.nr_big_tasks; +} + +unsigned int cpu_temp(int cpu) +{ + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + + if (per_cpu_info) + return per_cpu_info[cpu].temp; + else + return 0; +} + +/* + * kfree() may wakeup kswapd. So this function should NOT be called + * with any CPU's rq->lock acquired. + */ +void free_task_load_ptrs(struct task_struct *p) +{ + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + + /* + * update_task_ravg() can be called for exiting tasks. While the + * function itself ensures correct behavior, the corresponding + * trace event requires that these pointers be NULL. + */ + p->ravg.curr_window_cpu = NULL; + p->ravg.prev_window_cpu = NULL; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); + memset(&p->ravg, 0, sizeof(struct ravg)); + p->cpu_cycles = 0; + p->ravg.curr_burst = 0; + /* + * Initialize the avg_burst to twice the threshold, so that + * a task would not be classified as short burst right away + * after fork. It takes at least 6 sleep-wakeup cycles for + * the avg_burst to go below the threshold. + */ + p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; + p->ravg.avg_sleep_time = 0; + + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); + + /* Don't have much choice. 
CPU frequency would be bogus */ + BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu); + + if (init_load_pct) + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + + p->ravg.demand = init_load_windows; + p->ravg.pred_demand = 0; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +/* Return task demand in percentage scale */ +unsigned int pct_task_load(struct task_struct *p) +{ + unsigned int load; + + load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); + + return load; +} + +/* + * Return total number of tasks "eligible" to run on highest capacity cpu + * + * This is simply nr_big_tasks for cpus which are not of max_capacity and + * nr_running for cpus of max_capacity + */ +unsigned int nr_eligible_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + int nr_big = rq->hmp_stats.nr_big_tasks; + int nr = rq->nr_running; + + if (!is_max_capacity_cpu(cpu)) + return nr_big; + + return nr; +} + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} + +early_param("sched_ravg_window", set_sched_ravg_window); + +static inline void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + u32 freq; + + freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); + delta *= rq->cluster->exec_scale_factor; + delta >>= 10; + + return delta; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +/* Does freq_required sufficiently exceed or fall behind cur_freq? */ +static inline int +nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) +{ + int delta = freq_required - cur_freq; + + if (freq_required > cur_freq) + return delta < sysctl_sched_freq_inc_notify; + + delta = -delta; + + return delta < sysctl_sched_freq_dec_notify; +} + +/* Convert busy time to frequency equivalent */ +static inline unsigned int load_to_freq(struct rq *rq, u64 load) +{ + unsigned int freq; + + load = scale_load_to_cpu(load, cpu_of(rq)); + load *= 128; + load = div64_u64(load, max_task_load()); + + freq = load * cpu_max_possible_freq(cpu_of(rq)); + freq /= 128; + + return freq; +} + +/* + * Return load from all related groups in given frequency domain. + */ +static void group_load_in_freq_domain(struct cpumask *cpus, + u64 *grp_load, u64 *new_grp_load) +{ + int j; + + for_each_cpu(j, cpus) { + struct rq *rq = cpu_rq(j); + + *grp_load += rq->grp_time.prev_runnable_sum; + *new_grp_load += rq->grp_time.nt_prev_runnable_sum; + } +} + +static inline u64 freq_policy_load(struct rq *rq, u64 load); +/* + * Should scheduler alert governor for changing frequency? 
+ * + * @check_pred - evaluate frequency based on the predictive demand + * @check_groups - add load from all related groups on given cpu + * + * check_groups is set to 1 if a "related" task movement/wakeup is triggering + * the notification check. To avoid "re-aggregation" of demand in such cases, + * we check whether the migrated/woken tasks demand (along with demand from + * existing tasks on the cpu) can be met on target cpu + * + */ + +static int send_notification(struct rq *rq, int check_pred, int check_groups) +{ + unsigned int cur_freq, freq_required; + unsigned long flags; + int rc = 0; + u64 group_load = 0, new_load = 0; + + if (check_pred) { + u64 prev = rq->old_busy_time; + u64 predicted = rq->hmp_stats.pred_demands_sum; + + if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) + return 0; + + prev = max(prev, rq->old_estimated_time); + if (prev > predicted) + return 0; + + cur_freq = load_to_freq(rq, prev); + freq_required = load_to_freq(rq, predicted); + + if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) + return 0; + } else { + /* + * Protect from concurrent update of rq->prev_runnable_sum and + * group cpu load + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (check_groups) + group_load = rq->grp_time.prev_runnable_sum; + + new_load = rq->prev_runnable_sum + group_load; + new_load = freq_policy_load(rq, new_load); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + cur_freq = load_to_freq(rq, rq->old_busy_time); + freq_required = load_to_freq(rq, new_load); + + if (nearly_same_freq(cur_freq, freq_required)) + return 0; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->cluster->notifier_sent) { + rq->cluster->notifier_sent = 1; + rc = 1; + trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, + new_load); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +/* Alert governor if there is a need to change frequency */ +void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) +{ + int cpu = cpu_of(rq); + + if (!send_notification(rq, check_pred, check_groups)) + return; + + atomic_notifier_call_chain( + &load_alert_notifier_head, 0, + (void *)(long)cpu); +} + +void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, + struct task_struct *p) +{ + bool check_groups; + + rcu_read_lock(); + check_groups = task_in_related_thread_group(p); + rcu_read_unlock(); + + if (!same_freq_domain(src_cpu, dest_cpu)) { + if (!src_cpu_dead) + check_for_freq_change(cpu_rq(src_cpu), false, + check_groups); + check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); + } else { + check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); + } +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
+ return SCHED_FREQ_ACCOUNT_WAIT_TIME;
+}
+
+static inline bool is_new_task(struct task_struct *p)
+{
+ return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS;
+}
+
+#define INC_STEP 8
+#define DEC_STEP 2
+#define CONSISTENT_THRES 16
+#define INC_STEP_BIG 16
+/*
+ * bucket_increase - update the count of all buckets
+ *
+ * @buckets: array of buckets tracking busy time of a task
+ * @idx: the index of the bucket to be incremented
+ *
+ * Each time a complete window finishes, the count of the bucket that the
+ * runtime falls in (@idx) is incremented. Counts of all other buckets are
+ * decayed. The rate of increase and decay could be different based
+ * on the current count in the bucket.
+ */
+static inline void bucket_increase(u8 *buckets, int idx)
+{
+ int i, step;
+
+ for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
+ if (idx != i) {
+ if (buckets[i] > DEC_STEP)
+ buckets[i] -= DEC_STEP;
+ else
+ buckets[i] = 0;
+ } else {
+ step = buckets[i] >= CONSISTENT_THRES ?
+ INC_STEP_BIG : INC_STEP;
+ if (buckets[i] > U8_MAX - step)
+ buckets[i] = U8_MAX;
+ else
+ buckets[i] += step;
+ }
+ }
+}
+
+static inline int busy_to_bucket(u32 normalized_rt)
+{
+ int bidx;
+
+ bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
+ bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
+
+ /*
+ * Combine the lowest two buckets. The lowest frequency falls into
+ * the 2nd bucket, so continuing to predict the lowest bucket is not
+ * useful.
+ */
+ if (!bidx)
+ bidx++;
+
+ return bidx;
+}
+
+static inline u64
+scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
+{
+ return div64_u64(load * (u64)src_freq, (u64)dst_freq);
+}
+
+/*
+ * get_pred_busy - calculate predicted demand for a task on runqueue
+ *
+ * @rq: runqueue of task p
+ * @p: task whose prediction is being updated
+ * @start: starting bucket. returned prediction should not be lower than
+ * this bucket.
+ * @runtime: runtime of the task. returned prediction should not be lower
+ * than this runtime.
+ * Note: @start can be derived from @runtime. It's passed in only to
+ * avoid duplicated calculation in some cases.
+ *
+ * A new predicted busy time is returned for task @p based on @runtime
+ * passed in. The function searches through buckets that represent busy
+ * time equal to or bigger than @runtime and attempts to find the bucket
+ * to use for prediction. Once found, it searches through historical busy
+ * time and returns the latest that falls into the bucket. If no such busy
+ * time exists, it returns the midpoint of that bucket.
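+ *
+ * Worked example (illustrative, assuming NUM_BUSY_BUCKETS == 10 and the
+ * default 10ms window, so each bucket spans 1ms): a task that has run
+ * 2.3ms in the current window starts the search at bucket 2. If bucket 4
+ * is the first non-empty bucket at or above that, the prediction is the
+ * most recent history sample in [4ms, 5ms), or the 4.5ms midpoint when
+ * no sample falls in that range.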
+ */ +static u32 get_pred_busy(struct rq *rq, struct task_struct *p, + int start, u32 runtime) +{ + int i; + u8 *buckets = p->ravg.busy_buckets; + u32 *hist = p->ravg.sum_history; + u32 dmin, dmax; + u64 cur_freq_runtime = 0; + int first = NUM_BUSY_BUCKETS, final; + u32 ret = runtime; + + /* skip prediction for new tasks due to lack of history */ + if (unlikely(is_new_task(p))) + goto out; + + /* find minimal bucket index to pick */ + for (i = start; i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i]) { + first = i; + break; + } + } + /* if no higher buckets are filled, predict runtime */ + if (first >= NUM_BUSY_BUCKETS) + goto out; + + /* compute the bucket for prediction */ + final = first; + + /* determine demand range for the predicted bucket */ + if (final < 2) { + /* lowest two buckets are combined */ + dmin = 0; + final = 1; + } else { + dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); + } + dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); + + /* + * search through runtime history and return first runtime that falls + * into the range of predicted bucket. + */ + for (i = 0; i < sched_ravg_hist_size; i++) { + if (hist[i] >= dmin && hist[i] < dmax) { + ret = hist[i]; + break; + } + } + /* no historical runtime within bucket found, use average of the bin */ + if (ret < dmin) + ret = (dmin + dmax) / 2; + /* + * when updating in middle of a window, runtime could be higher + * than all recorded history. Always predict at least runtime. + */ + ret = max(runtime, ret); +out: + trace_sched_update_pred_demand(rq, p, runtime, + mult_frac((unsigned int)cur_freq_runtime, 100, + sched_ravg_window), ret); + return ret; +} + +static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) +{ + if (p->ravg.pred_demand >= p->ravg.curr_window) + return p->ravg.pred_demand; + + return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), + p->ravg.curr_window); +} + +/* + * predictive demand of a task is calculated at the window roll-over. + * if the task current window busy time exceeds the predicted + * demand, update it here to reflect the task needs. + */ +void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ + u32 new, old; + + if (is_idle_task(p) || exiting_task(p)) + return; + + if (event != PUT_PREV_TASK && event != TASK_UPDATE && + (!SCHED_FREQ_ACCOUNT_WAIT_TIME || + (event != TASK_MIGRATE && + event != PICK_NEXT_TASK))) + return; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) + return; + } + + new = calc_pred_demand(rq, p); + old = p->ravg.pred_demand; + + if (old >= new) + return; + + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, + p->ravg.demand, + new); + + p->ravg.pred_demand = new; +} + +void clear_top_tasks_bitmap(unsigned long *bitmap) +{ + memset(bitmap, 0, top_tasks_bitmap_size); + __set_bit(NUM_LOAD_INDICES, bitmap); +} + +/* + * Special case the last index and provide a fast path for index = 0. + * Note that sched_load_granule can change underneath us if we are not + * holding any runqueue locks while calling the two functions below. 
+ */
+static u32 top_task_load(struct rq *rq)
+{
+ int index = rq->prev_top;
+ u8 prev = 1 - rq->curr_table;
+
+ if (!index) {
+ int msb = NUM_LOAD_INDICES - 1;
+
+ if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
+ return 0;
+ else
+ return sched_load_granule;
+ } else if (index == NUM_LOAD_INDICES - 1) {
+ return sched_ravg_window;
+ } else {
+ return (index + 1) * sched_load_granule;
+ }
+}
+
+static u32 load_to_index(u32 load)
+{
+ u32 index = load / sched_load_granule;
+
+ return min(index, (u32)(NUM_LOAD_INDICES - 1));
+}
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+ u32 old_curr_window, int new_window, bool full_window)
+{
+ u8 curr = rq->curr_table;
+ u8 prev = 1 - curr;
+ u8 *curr_table = rq->top_tasks[curr];
+ u8 *prev_table = rq->top_tasks[prev];
+ int old_index, new_index, update_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ bool zero_index_update;
+
+ if (old_curr_window == curr_window && !new_window)
+ return;
+
+ old_index = load_to_index(old_curr_window);
+ new_index = load_to_index(curr_window);
+
+ if (!new_window) {
+ zero_index_update = !old_curr_window && curr_window;
+ if (old_index != new_index || zero_index_update) {
+ if (old_curr_window)
+ curr_table[old_index] -= 1;
+ if (curr_window)
+ curr_table[new_index] += 1;
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+
+ if (!curr_table[old_index])
+ __clear_bit(NUM_LOAD_INDICES - old_index - 1,
+ rq->top_tasks_bitmap[curr]);
+
+ if (curr_table[new_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - new_index - 1,
+ rq->top_tasks_bitmap[curr]);
+
+ return;
+ }
+
+ /*
+ * The window has rolled over for this task. By the time we get
+ * here, the curr/prev swaps will have already occurred. So we need
+ * to use prev_window for the new index.
+ */
+ update_index = load_to_index(prev_window);
+
+ if (full_window) {
+ /*
+ * Two cases here. Either 'p' ran for the entire window or
+ * it didn't run at all. In either case there is no entry
+ * in the prev table. If 'p' ran the entire window, we just
+ * need to create a new entry in the prev table. In this case
+ * update_index will correspond to sched_ravg_window,
+ * so we can unconditionally update the top index.
+ */ + if (prev_window) { + prev_table[update_index] += 1; + rq->prev_top = update_index; + } + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + rq->top_tasks_bitmap[prev]); + } else { + zero_index_update = !old_curr_window && prev_window; + if (old_index != update_index || zero_index_update) { + if (old_curr_window) + prev_table[old_index] -= 1; + + prev_table[update_index] += 1; + + if (update_index > rq->prev_top) + rq->prev_top = update_index; + + if (!prev_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + rq->top_tasks_bitmap[prev]); + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + rq->top_tasks_bitmap[prev]); + } + } + + if (curr_window) { + curr_table[new_index] += 1; + + if (new_index > rq->curr_top) + rq->curr_top = new_index; + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + rq->top_tasks_bitmap[curr]); + } +} + +static inline void clear_top_tasks_table(u8 *table) +{ + memset(table, 0, NUM_LOAD_INDICES * sizeof(u8)); +} + +static void rollover_top_tasks(struct rq *rq, bool full_window) +{ + u8 curr_table = rq->curr_table; + u8 prev_table = 1 - curr_table; + int curr_top = rq->curr_top; + + clear_top_tasks_table(rq->top_tasks[prev_table]); + clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]); + + if (full_window) { + curr_top = 0; + clear_top_tasks_table(rq->top_tasks[curr_table]); + clear_top_tasks_bitmap( + rq->top_tasks_bitmap[curr_table]); + } + + rq->curr_table = prev_table; + rq->prev_top = curr_top; + rq->curr_top = 0; +} + +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + u64 grp_curr_sum = rq->grp_time.curr_runnable_sum; + u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + grp_curr_sum = 0; + grp_nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + rq->grp_time.prev_runnable_sum = grp_curr_sum; + rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + rq->grp_time.curr_runnable_sum = 0; + rq->grp_time.nt_curr_runnable_sum = 0; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + bool new_task; + 
struct related_thread_group *grp; + int cpu = rq->cpu; + u32 old_curr_window = p->ravg.curr_window; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. + */ + if (!is_idle_task(p) && !exiting_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + rollover_top_tasks(rq, full_window); + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time = &rq->grp_time; + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. 
If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. */ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + if (!is_idle_task(p) && !exiting_task(p)) + update_top_tasks(p, rq, old_curr_window, + new_window, full_window); +} + +static inline u32 predict_and_update_buckets(struct rq *rq, + struct task_struct *p, u32 runtime) { + + int bidx; + u32 pred_demand; + + bidx = busy_to_bucket(runtime); + pred_demand = get_pred_busy(rq, p, bidx, runtime); + bucket_increase(p->ravg.busy_buckets, bidx); + + return pred_demand; +} + +#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC) + +/* + * Assumes rq_lock is held and wallclock was recorded in the same critical + * section as this function's invocation. 
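+ *
+ * For example: with THRESH_CC_UPDATE at 2us, CPUs of the same cluster
+ * that read the counter within 2us of each other share the cached
+ * cluster->cycles value; only a reader arriving more than 2us after
+ * last_cc_update pays for a fresh get_cpu_cycle_counter() call.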
+ */
+static inline u64 read_cycle_counter(int cpu, u64 wallclock)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+ u64 delta;
+
+ if (unlikely(!cluster))
+ return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+ /*
+ * Why don't we need locking here? Let's say that delta is negative
+ * because some other CPU happened to update last_cc_update with a
+ * more recent timestamp. We simply read the counter again in that case
+ * with no harmful side effects. This can happen if there is an FIQ
+ * between when we read the wallclock and when we use it here.
+ */
+ delta = wallclock - atomic64_read(&cluster->last_cc_update);
+ if (delta > THRESH_CC_UPDATE) {
+ atomic64_set(&cluster->cycles,
+ cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu));
+ atomic64_set(&cluster->last_cc_update, wallclock);
+ }
+
+ return atomic64_read(&cluster->cycles);
+}
+
+static void update_task_cpu_cycles(struct task_struct *p, int cpu,
+ u64 wallclock)
+{
+ if (use_cycle_counter)
+ p->cpu_cycles = read_cycle_counter(cpu, wallclock);
+}
+
+static void
+update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ u64 cur_cycles;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!use_cycle_counter) {
+ rq->cc.cycles = cpu_cur_freq(cpu);
+ rq->cc.time = 1;
+ return;
+ }
+
+ cur_cycles = read_cycle_counter(cpu, wallclock);
+
+ /*
+ * If the current task is the idle task and irqtime == 0, the CPU was
+ * indeed idle and its cycle counter was probably not increasing.
+ * We still need an estimated CPU frequency for IO wait time
+ * accounting. Use the previously calculated frequency in such a case.
+ */
+ if (!is_idle_task(rq->curr) || irqtime) {
+ if (unlikely(cur_cycles < p->cpu_cycles))
+ rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
+ else
+ rq->cc.cycles = cur_cycles - p->cpu_cycles;
+ rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
+
+ if (event == IRQ_UPDATE && is_idle_task(p))
+ /*
+ * Time between the idle task's mark_start and IRQ
+ * handler entry is the CPU cycle counter stall period.
+ * Upon IRQ handler entry sched_account_irqstart()
+ * replenishes the idle task's cpu cycle counter, so
+ * rq->cc.cycles now represents increased cycles during
+ * the IRQ handler rather than time between idle entry
+ * and IRQ exit. Thus use irqtime as the time delta.
+ */
+ rq->cc.time = irqtime;
+ else
+ rq->cc.time = wallclock - p->ravg.mark_start;
+ BUG_ON((s64)rq->cc.time < 0);
+ }
+
+ p->cpu_cycles = cur_cycles;
+
+ trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles,
+ rq->cc.time, p);
+}
+
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
+{
+ /*
+ * No need to bother updating task demand for exiting tasks
+ * or the idle task.
+ */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /*
+ * When a task is waking up, it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time.
+ */
+ if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ /*
+ * TASK_UPDATE can be called on a sleeping task when it is moved
+ * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Called when a new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). 
Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand, pred_demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + pred_demand = predict_and_update_buckets(rq, p, runtime); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, demand, + pred_demand); + + p->ravg.demand = demand; + p->ravg.pred_demand = pred_demand; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! 
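+ *
+ * Worked example for case (c), using illustrative numbers and ignoring
+ * the scaling applied by scale_exec_time(): with a 20ms window,
+ * ms = 5ms, ws = 60ms and wc = 73ms, we get ws_tmp = 20ms and
+ * nr_full_windows = 2. The busy time is then split as 15ms into the
+ * window that ended at 20ms, two full 20ms windows, and finally 13ms
+ * into the current window that started at 60ms.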
+ */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. + */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. + */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static inline void +update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime) +{ + /* + * update_task_demand() has checks for idle task and + * exit task. The runtime may include the wait time, + * so update the burst only for the cases where the + * task is running. + */ + if (event == PUT_PREV_TASK || (event == TASK_UPDATE && + rq->curr == p)) + p->ravg.curr_burst += runtime; +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 runtime; + + if (!rq->window_start || sched_disable_window_stats || + p->ravg.mark_start == wallclock) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) { + update_task_cpu_cycles(p, cpu_of(rq), wallclock); + goto done; + } + + update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); + runtime = update_task_demand(p, rq, event, wallclock); + if (runtime) + update_task_burst(p, rq, event, runtime); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + update_task_pred_demand(rq, p, event); + + if (exiting_task(p)) + goto done; + + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, + rq->cc.cycles, rq->cc.time, + p->grp ? 
&rq->grp_time : NULL); + +done: + p->ravg.mark_start = wallclock; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + + if (!rq->window_start || sched_disable_window_stats) + return; + + if (is_idle_task(curr)) { + /* We're here without rq->lock held, IRQ disabled */ + raw_spin_lock(&rq->lock); + update_task_cpu_cycles(curr, cpu, sched_ktime_clock()); + raw_spin_unlock(&rq->lock); + } +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; + + if (exiting_task(p)) { + sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } + + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + + p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; + + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = p->last_wake_ts = wallclock; + p->last_cpu_selected_ts = wallclock; + p->last_switch_out_ts = 0; + update_task_cpu_cycles(p, cpu_of(rq), wallclock); +} + +void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + + if (rq->window_start) + return; + + if (!sync_cpu_available) { + rq->window_start = sched_ktime_clock(); + sync_cpu_available = 1; + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = sync_rq->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +static void reset_all_task_stats(void) +{ + struct task_struct *g, *p; + + do_each_thread(g, p) { + reset_task_stats(p); + } while_each_thread(g, p); +} + +enum reset_reason_code { + WINDOW_CHANGE, + POLICY_CHANGE, + HIST_SIZE_CHANGE, + FREQ_AGGREGATE_CHANGE, +}; + +const char *sched_window_reset_reasons[] = { + "WINDOW_CHANGE", + "POLICY_CHANGE", + "HIST_SIZE_CHANGE", + "FREQ_AGGREGATE_CHANGE", +}; + +/* Called with IRQs enabled */ +void 
reset_all_window_stats(u64 window_start, unsigned int window_size)
+{
+ int cpu, i;
+ unsigned long flags;
+ u64 start_ts = sched_ktime_clock();
+ int reason = WINDOW_CHANGE;
+ unsigned int old = 0, new = 0;
+
+ local_irq_save(flags);
+
+ read_lock(&tasklist_lock);
+
+ read_lock(&related_thread_group_lock);
+
+ /* Taking all runqueue locks prevents a race with sched_exit(). */
+ for_each_possible_cpu(cpu)
+ raw_spin_lock(&cpu_rq(cpu)->lock);
+
+ sched_disable_window_stats = 1;
+
+ reset_all_task_stats();
+
+ read_unlock(&tasklist_lock);
+
+ if (window_size) {
+ sched_ravg_window = window_size * TICK_NSEC;
+ set_hmp_defaults();
+ sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
+ }
+
+ sched_disable_window_stats = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (window_start)
+ rq->window_start = window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+ memset(&rq->load_subs[i], 0,
+ sizeof(struct load_subtractions));
+ clear_top_tasks_table(rq->top_tasks[i]);
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
+ }
+
+ rq->curr_table = 0;
+ rq->curr_top = 0;
+ rq->prev_top = 0;
+ reset_cpu_hmp_stats(cpu, 1);
+ }
+
+ if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
+ reason = POLICY_CHANGE;
+ old = sched_window_stats_policy;
+ new = sysctl_sched_window_stats_policy;
+ sched_window_stats_policy = sysctl_sched_window_stats_policy;
+ } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
+ reason = HIST_SIZE_CHANGE;
+ old = sched_ravg_hist_size;
+ new = sysctl_sched_ravg_hist_size;
+ sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
+ } else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
+ }
+
+ for_each_possible_cpu(cpu)
+ raw_spin_unlock(&cpu_rq(cpu)->lock);
+
+ read_unlock(&related_thread_group_lock);
+
+ local_irq_restore(flags);
+
+ trace_sched_reset_all_window_stats(window_start, window_size,
+ sched_ktime_clock() - start_ts, reason, old, new);
+}
+
+/*
+ * In this function we match the accumulated subtractions with the current
+ * and previous windows we are operating with. Ignore any entries where
+ * the window start in the load_subtraction struct does not match either
+ * the current or the previous window. This could happen whenever CPUs
+ * become idle or busy with interrupts disabled for an extended period. 
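+ *
+ * For example: if rq->window_start is W, an entry whose window_start
+ * equals W is subtracted from the current runnable sums, an entry at
+ * W - sched_ravg_window is subtracted from the previous runnable sums,
+ * and any older entry is simply cleared without being applied.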
+ */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + +static inline u64 freq_policy_load(struct rq *rq, u64 load) +{ + unsigned int reporting_policy = sysctl_sched_freq_reporting_policy; + + switch (reporting_policy) { + case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK: + load = max_t(u64, load, top_task_load(rq)); + break; + case FREQ_REPORT_TOP_TASK: + load = top_task_load(rq); + break; + case FREQ_REPORT_CPU_LOAD: + break; + default: + break; + } + + return load; +} + +void sched_get_cpus_busy(struct sched_load *busy, + const struct cpumask *query_cpus) +{ + unsigned long flags; + struct rq *rq; + const int cpus = cpumask_weight(query_cpus); + u64 load[cpus], group_load[cpus]; + u64 nload[cpus], ngload[cpus]; + u64 pload[cpus]; + unsigned int max_freq[cpus]; + int notifier_sent = 0; + int early_detection[cpus]; + int cpu, i = 0; + unsigned int window_size; + u64 max_prev_sum = 0; + int max_busy_cpu = cpumask_first(query_cpus); + u64 total_group_load = 0, total_ngload = 0; + bool aggregate_load = false; + struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus)); + + if (unlikely(cpus == 0)) + return; + + local_irq_save(flags); + + /* + * This function could be called in timer context, and the + * current task may have been executing for a long time. Ensure + * that the window stats are current by doing an update. + */ + + for_each_cpu(cpu, query_cpus) + raw_spin_lock(&cpu_rq(cpu)->lock); + + window_size = sched_ravg_window; + + /* + * We don't really need the cluster lock for this entire for loop + * block. However, there is no advantage in optimizing this as rq + * locks are held regardless and would prevent migration anyways + */ + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), + 0); + + /* + * Ensure that we don't report load for 'cpu' again via the + * cpufreq_update_util path in the window that started at + * rq->window_start + */ + rq->load_reported_window = rq->window_start; + + account_load_subtractions(rq); + load[i] = rq->prev_runnable_sum; + nload[i] = rq->nt_prev_runnable_sum; + pload[i] = rq->hmp_stats.pred_demands_sum; + rq->old_estimated_time = pload[i]; + + if (load[i] > max_prev_sum) { + max_prev_sum = load[i]; + max_busy_cpu = cpu; + } + + /* + * sched_get_cpus_busy() is called for all CPUs in a + * frequency domain. So the notifier_sent flag per + * cluster works even when a frequency domain spans + * more than 1 cluster. 
+ */ + if (rq->cluster->notifier_sent) { + notifier_sent = 1; + rq->cluster->notifier_sent = 0; + } + early_detection[i] = (rq->ed_task != NULL); + max_freq[i] = cpu_max_freq(cpu); + i++; + } + + raw_spin_unlock(&cluster->load_lock); + + group_load_in_freq_domain( + &cpu_rq(max_busy_cpu)->freq_domain_cpumask, + &total_group_load, &total_ngload); + aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold); + + i = 0; + for_each_cpu(cpu, query_cpus) { + group_load[i] = 0; + ngload[i] = 0; + + if (early_detection[i]) + goto skip_early; + + rq = cpu_rq(cpu); + if (aggregate_load) { + if (cpu == max_busy_cpu) { + group_load[i] = total_group_load; + ngload[i] = total_ngload; + } + } else { + group_load[i] = rq->grp_time.prev_runnable_sum; + ngload[i] = rq->grp_time.nt_prev_runnable_sum; + } + + load[i] += group_load[i]; + nload[i] += ngload[i]; + + load[i] = freq_policy_load(rq, load[i]); + rq->old_busy_time = load[i]; + + /* + * Scale load in reference to cluster max_possible_freq. + * + * Note that scale_load_to_cpu() scales load in reference to + * the cluster max_freq. + */ + load[i] = scale_load_to_cpu(load[i], cpu); + nload[i] = scale_load_to_cpu(nload[i], cpu); + pload[i] = scale_load_to_cpu(pload[i], cpu); +skip_early: + i++; + } + + for_each_cpu(cpu, query_cpus) + raw_spin_unlock(&(cpu_rq(cpu))->lock); + + local_irq_restore(flags); + + i = 0; + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + if (early_detection[i]) { + busy[i].prev_load = div64_u64(sched_ravg_window, + NSEC_PER_USEC); + busy[i].new_task_load = 0; + busy[i].predicted_load = 0; + goto exit_early; + } + + load[i] = scale_load_to_freq(load[i], max_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cpu_max_possible_freq(cpu)); + + pload[i] = scale_load_to_freq(pload[i], max_freq[i], + rq->cluster->max_possible_freq); + + busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); + busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); + busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); + +exit_early: + trace_sched_get_busy(cpu, busy[i].prev_load, + busy[i].new_task_load, + busy[i].predicted_load, + early_detection[i], + aggregate_load && + cpu == max_busy_cpu); + i++; + } +} + +void sched_set_io_is_busy(int val) +{ + sched_io_is_busy = val; +} + +int sched_set_window(u64 window_start, unsigned int window_size) +{ + u64 now, cur_jiffies, jiffy_ktime_ns; + s64 ws; + unsigned long flags; + + if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW) + return -EINVAL; + + mutex_lock(&policy_mutex); + + /* + * Get a consistent view of ktime, jiffies, and the time + * since the last jiffy (based on last_jiffies_update). + */ + local_irq_save(flags); + cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); + local_irq_restore(flags); + + /* translate window_start from jiffies to nanoseconds */ + ws = (window_start - cur_jiffies); /* jiffy difference */ + ws *= TICK_NSEC; + ws += jiffy_ktime_ns; + + /* + * Roll back calculated window start so that it is in + * the past (window stats must have a current window). 
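+ *
+ * Illustrative numbers: if the computed ws lands 30ms ahead of 'now'
+ * and the window size corresponds to 20ms, the loop below steps ws
+ * back twice so that the window boundary lies in the recent past
+ * rather than in the future.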
+ */ + while (ws > now) + ws -= (window_size * TICK_NSEC); + + BUG_ON(sched_ktime_clock() < ws); + + reset_all_window_stats(ws, window_size); + + sched_update_freq_max_load(cpu_possible_mask); + + mutex_unlock(&policy_mutex); + + return 0; +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +static void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] = 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + +static int get_top_index(unsigned long *bitmap, unsigned long old_top) +{ + int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top); + + if (index == NUM_LOAD_INDICES) + return 0; + + return NUM_LOAD_INDICES - 1 - index; +} + +static void +migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) +{ + int index; + int top_index; + 
u32 curr_window = p->ravg.curr_window; + u32 prev_window = p->ravg.prev_window; + u8 src = src_rq->curr_table; + u8 dst = dst_rq->curr_table; + u8 *src_table; + u8 *dst_table; + + if (curr_window) { + src_table = src_rq->top_tasks[src]; + dst_table = dst_rq->top_tasks[dst]; + index = load_to_index(curr_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_rq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_rq->top_tasks_bitmap[dst]); + + if (index > dst_rq->curr_top) + dst_rq->curr_top = index; + + top_index = src_rq->curr_top; + if (index == top_index && !src_table[index]) + src_rq->curr_top = get_top_index( + src_rq->top_tasks_bitmap[src], top_index); + } + + if (prev_window) { + src = 1 - src; + dst = 1 - dst; + src_table = src_rq->top_tasks[src]; + dst_table = dst_rq->top_tasks[dst]; + index = load_to_index(prev_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_rq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_rq->top_tasks_bitmap[dst]); + + if (index > dst_rq->prev_top) + dst_rq->prev_top = index; + + top_index = src_rq->prev_top; + if (index == top_index && !src_table[index]) + src_rq->prev_top = get_top_index( + src_rq->top_tasks_bitmap[src], top_index); + } +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + bool new_task; + struct related_thread_group *grp; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + clear_ed_task(p, src_rq); + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + update_task_cpu_cycles(p, new_cpu, wallclock); + + new_task = is_new_task(p); + /* Protected by rq_lock */ + grp = p->grp; + + /* + * For frequency aggregation, we continue to do migration fixups + * even for intra cluster migrations. This is because, the aggregated + * load has to reported on a single CPU regardless. 
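+ *
+ * For example: when aggregation is enabled and a grouped task moves
+ * between two CPUs of the same cluster, its curr/prev window
+ * contributions are still moved from the source rq's grp_time
+ * counters to the destination rq's grp_time counters below, so the
+ * group load is neither lost nor double counted.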
+ */ + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time; + + cpu_time = &src_rq->grp_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + cpu_time = &dest_rq->grp_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window; + *dst_nt_curr_runnable_sum += + p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= + p->ravg.prev_window; + *dst_nt_prev_runnable_sum += + p->ravg.prev_window; + } + } + } else { + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); + } + + migrate_top_tasks(p, src_rq, dest_rq); + + if (!same_freq_domain(new_cpu, task_cpu(p))) { + cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + } + + if (p == src_rq->ed_task) { + src_rq->ed_task = NULL; + if (!dest_rq->ed_task) + dest_rq->ed_task = p; + } + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +#define sched_up_down_migrate_auto_update 1 +static void check_for_up_down_migrate_update(const struct cpumask *cpus) +{ + int i = cpumask_first(cpus); + + if (!sched_up_down_migrate_auto_update) + return; + + if (cpu_max_possible_capacity(i) == max_possible_capacity) + return; + + if (cpu_max_possible_freq(i) == cpu_max_freq(i)) + up_down_migrate_scale_factor = 1024; + else + up_down_migrate_scale_factor = (1024 * + cpu_max_possible_freq(i)) / cpu_max_freq(i); + + update_up_down_migrate(); +} + +/* Return cluster which can offer required capacity for group */ +static struct sched_cluster *best_cluster(struct related_thread_group *grp, + u64 total_demand, bool group_boost) +{ + struct sched_cluster *cluster = NULL; + + for_each_sched_cluster(cluster) { + if (group_will_fit(cluster, grp, total_demand, group_boost)) + return cluster; + } + + return sched_cluster[0]; +} + +static void _set_preferred_cluster(struct related_thread_group *grp) +{ + struct task_struct *p; + u64 combined_demand = 0; + bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG; + bool group_boost = false; + u64 wallclock; + + if (list_empty(&grp->tasks)) + return; + + wallclock = sched_ktime_clock(); + + /* + * wakeup of two or more related tasks could race with each other and + * could result in multiple calls to _set_preferred_cluster being issued + * at same time. 
Avoid overhead in such cases of rechecking preferred + * cluster + */ + if (wallclock - grp->last_update < sched_ravg_window / 10) + return; + + list_for_each_entry(p, &grp->tasks, grp_list) { + if (boost_on_big && task_sched_boost(p)) { + group_boost = true; + break; + } + + if (p->ravg.mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + + combined_demand += p->ravg.demand; + + } + + grp->preferred_cluster = best_cluster(grp, + combined_demand, group_boost); + grp->last_update = sched_ktime_clock(); + trace_sched_set_preferred_cluster(grp, combined_demand); +} + +void set_preferred_cluster(struct related_thread_group *grp) +{ + raw_spin_lock(&grp->lock); + _set_preferred_cluster(grp); + raw_spin_unlock(&grp->lock); +} + +#define ADD_TASK 0 +#define REM_TASK 1 + +#define DEFAULT_CGROUP_COLOC_ID 1 + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + int cpu = cpu_of(rq); + bool new_task; + int i; + + if (!sched_freq_aggregate) + return; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + new_task = is_new_task(p); + + cpu_time = &rq->grp_time; + if (event == ADD_TASK) { + migrate_type = RQ_TO_GROUP; + + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; + *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[cpu]; + *src_nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[cpu]; + } + + update_cluster_load_subtractions(p, cpu, + rq->window_start, new_task); + + } else { + migrate_type = GROUP_TO_RQ; + + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window; + *src_prev_runnable_sum -= p->ravg.prev_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + } + + /* + * Need to reset curr/prev windows for all CPUs, not just the + * ones in the same cluster. 
Since inter cluster migrations + * did not result in the appropriate book keeping, the values + * per CPU would be inaccurate. + */ + for_each_possible_cpu(i) { + p->ravg.curr_window_cpu[i] = 0; + p->ravg.prev_window_cpu[i] = 0; + } + } + + *dst_curr_runnable_sum += p->ravg.curr_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + /* + * When a task enter or exits a group, it's curr and prev windows are + * moved to a single CPU. This behavior might be sub-optimal in the + * exit case, however, it saves us the overhead of handling inter + * cluster migration fixups while the task is part of a related group. + */ + p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; + + trace_sched_migration_update_sum(p, migrate_type, rq); + + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_prev_runnable_sum < 0); + BUG_ON((s64)*src_nt_curr_runnable_sum < 0); + BUG_ON((s64)*src_nt_prev_runnable_sum < 0); +} + +static inline struct related_thread_group* +lookup_related_thread_group(unsigned int group_id) +{ + return related_thread_groups[group_id]; +} + +int alloc_related_thread_groups(void) +{ + int i, ret; + struct related_thread_group *grp; + + /* groupd_id = 0 is invalid as it's special id to remove group. */ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = kzalloc(sizeof(*grp), GFP_NOWAIT); + if (!grp) { + ret = -ENOMEM; + goto err; + } + + grp->id = i; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + related_thread_groups[i] = grp; + } + + return 0; + +err: + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = lookup_related_thread_group(i); + if (grp) { + kfree(grp); + related_thread_groups[i] = NULL; + } else { + break; + } + } + + return ret; +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq; + int empty_group = 1; + + raw_spin_lock(&grp->lock); + + rq = __task_rq_lock(p); + transfer_busy_time(rq, p->grp, p, REM_TASK); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + __task_rq_unlock(rq); + + if (!list_empty(&grp->tasks)) { + empty_group = 0; + _set_preferred_cluster(grp); + } + + raw_spin_unlock(&grp->lock); + + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) + /* + * We test whether grp->list is attached with list_empty() + * hence re-init the list after deletion. + */ + list_del_init(&grp->list); +} + +static int +add_task_to_group(struct task_struct *p, struct related_thread_group *grp) +{ + struct rq *rq; + + raw_spin_lock(&grp->lock); + + /* + * Change p->grp under rq->lock. 
This will prevent races with read-side
+ * references of p->grp in various hot paths.
+ */
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
+ list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
+ __task_rq_unlock(rq);
+
+ _set_preferred_cluster(grp);
+
+ raw_spin_unlock(&grp->lock);
+
+ return 0;
+}
+
+void add_new_task_to_grp(struct task_struct *new)
+{
+ unsigned long flags;
+ struct related_thread_group *grp;
+ struct task_struct *leader = new->group_leader;
+ unsigned int leader_grp_id = sched_get_group_id(leader);
+
+ if (!sysctl_sched_enable_thread_grouping &&
+ leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
+ return;
+
+ if (thread_group_leader(new))
+ return;
+
+ if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+ if (!same_schedtune(new, leader))
+ return;
+ }
+
+ write_lock_irqsave(&related_thread_group_lock, flags);
+
+ rcu_read_lock();
+ grp = task_related_thread_group(leader);
+ rcu_read_unlock();
+
+ /*
+ * It's possible that someone already added the new task to the
+ * group. A leader's thread group is updated prior to calling
+ * this function. It's also possible that the leader has exited
+ * the group. In either case, there is nothing else to do.
+ */
+ if (!grp || new->grp) {
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+ return;
+ }
+
+ raw_spin_lock(&grp->lock);
+
+ rcu_assign_pointer(new->grp, grp);
+ list_add(&new->grp_list, &grp->tasks);
+
+ raw_spin_unlock(&grp->lock);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+}
+
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ int rc = 0;
+ unsigned long flags;
+ struct related_thread_group *grp = NULL;
+
+ if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ write_lock(&related_thread_group_lock);
+
+ /* Switching from one group to another directly is not permitted */
+ if ((current != p && p->flags & PF_EXITING) ||
+ (!p->grp && !group_id) ||
+ (p->grp && group_id))
+ goto done;
+
+ if (!group_id) {
+ remove_task_from_group(p);
+ goto done;
+ }
+
+ grp = lookup_related_thread_group(group_id);
+ if (list_empty(&grp->list))
+ list_add(&grp->list, &active_related_thread_groups);
+
+ rc = add_task_to_group(p, grp);
+done:
+ write_unlock(&related_thread_group_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return rc;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ return __sched_set_group_id(p, group_id);
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+ unsigned int group_id;
+ struct related_thread_group *grp;
+
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ group_id = grp ? grp->id : 0;
+ rcu_read_unlock();
+
+ return group_id;
+}
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as a
+ * runtime optimization.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on. 
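+ *
+ * For example: sync_cgroup_colocation() below maps the cgroup flag to
+ * a group id (DEFAULT_CGROUP_COLOC_ID to attach a task, 0 to detach
+ * it) and hands that id to __sched_set_group_id().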
+ */ +static int __init create_default_coloc_group(void) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &active_related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + + update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH); + return 0; +} +late_initcall(create_default_coloc_group); + +int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; + + return __sched_set_group_id(p, grp_id); +} +#endif + +static void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + + cpumask_copy(&cpumask, cpus); + pre_big_task_count_change(cpu_possible_mask); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + /* 'cpus' can contain cpumask more than one cluster */ + check_for_up_down_migrate_update(&cluster->cpus); + } + + __update_min_max_capacity(); + + post_big_task_count_change(cpu_possible_mask); +} + +static DEFINE_SPINLOCK(cpu_freq_min_max_lock); +void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) +{ + struct cpumask cpumask; + struct sched_cluster *cluster; + int i, update_capacity = 0; + unsigned long flags; + + spin_lock_irqsave(&cpu_freq_min_max_lock, flags); + cpumask_copy(&cpumask, cpus); + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + update_capacity += (cluster->max_mitigated_freq != fmax); + cluster->max_mitigated_freq = fmax; + } + spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); + + if (update_capacity) + update_cpu_cluster_capacity(cpus); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, &policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != 
cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_cur_freq(cpu) == new_freq) + return 0; + + for_each_cpu(i, &policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return 0; +} + +static int pwr_stats_ready_notifier(struct notifier_block *nb, + unsigned long cpu, void *data) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + sched_update_freq_max_load(&mask); + + mutex_lock(&cluster_lock); + sort_clusters(); + mutex_unlock(&cluster_lock); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static struct notifier_block notifier_pwr_stats_ready = { + .notifier_call = pwr_stats_ready_notifier +}; + +int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) +{ + return -EINVAL; +} + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. 
+ */
+core_initcall(register_sched_callback);
+
+int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ u32 new_load = task_load(p);
+
+ if (!grp)
+ return 0;
+
+ /*
+ * Update if the task's load has changed significantly or a complete
+ * window has passed since we last updated the preference.
+ */
+ if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+ sched_ktime_clock() - grp->last_update > sched_ravg_window)
+ return 1;
+
+ return 0;
+}
+
+bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+ struct task_struct *p;
+ int loop_max = 10;
+
+ if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
+ return 0;
+
+ rq->ed_task = NULL;
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
+ if (!loop_max)
+ break;
+
+ if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
+ rq->ed_task = p;
+ return 1;
+ }
+
+ loop_max--;
+ }
+
+ return 0;
+}
+
+void update_avg_burst(struct task_struct *p)
+{
+ update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
+ p->ravg.curr_burst = 0;
+}
+
+void note_task_waking(struct task_struct *p, u64 wallclock)
+{
+ u64 sleep_time = wallclock - p->last_switch_out_ts;
+
+ /*
+ * When a short-burst, short-sleeping task goes for a long
+ * sleep, the task's avg_sleep_time gets boosted. It will not
+ * come back below the short-sleep threshold for a long time,
+ * which results in incorrect packing. The idea behind tracking
+ * avg_sleep_time is to detect whether a task is short-sleeping
+ * or not. So limit the sleep time to twice the short-sleep
+ * threshold. For regular long-sleeping tasks, the avg_sleep_time
+ * would be higher than the threshold, and packing happens correctly.
+ */
+ sleep_time = min_t(u64, sleep_time, 2 * sysctl_sched_short_sleep);
+ update_avg(&p->ravg.avg_sleep_time, sleep_time);
+
+ p->last_wake_ts = wallclock;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return tg->upmigrate_discouraged;
+}
+
+int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 upmigrate_discourage)
+{
+ struct task_group *tg = css_tg(css);
+ int discourage = upmigrate_discourage > 0;
+
+ if (tg->upmigrate_discouraged == discourage)
+ return 0;
+
+ /*
+ * Revisit big-task classification for tasks of this cgroup. It would
+ * have been efficient to walk tasks of just this cgroup in running
+ * state, but we don't have an easy means to do that. Walk all tasks in
+ * running state on all cpus instead and revisit their big-task
+ * classification. 
+ */ + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + + tg->upmigrate_discouraged = discourage; + + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + + return 0; +} +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 33d7003fa1b8..d562efb04775 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -80,6 +80,26 @@ static void update_curr_idle(struct rq *rq) { } +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p) +{ +} + +static void +dec_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p) +{ +} + +static void +fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ +} + +#endif + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -108,4 +128,9 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_idle, + .dec_hmp_sched_stats = dec_hmp_sched_stats_idle, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle, +#endif }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad0519e8c7f5..391ec29c71c0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,11 +5,12 @@ #include "sched.h" +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/irq_work.h> +#include <trace/events/sched.h> #include <linux/hrtimer.h> -#include "walt.h" #include "tune.h" int sched_rr_timeslice = RR_TIMESLICE; @@ -258,8 +259,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -430,7 +435,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -476,8 +481,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -493,7 +498,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -510,7 +515,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1244,6 +1249,41 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct 
*p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) { @@ -1280,7 +1320,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1293,26 +1356,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1321,7 +1395,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. 
*/ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1334,31 +1408,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1374,8 +1448,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - walt_inc_cumulative_runnable_avg(rq, p); + enqueue_rt_entity(rt_se, flags); + inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1413,8 +1487,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); - walt_dec_cumulative_runnable_avg(rq, p); + dequeue_rt_entity(rt_se, flags); + dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); @@ -1469,6 +1543,40 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +#ifdef CONFIG_SCHED_HMP +static int +select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + int target; + + rcu_read_lock(); + target = find_lowest_rq(p); + if (target != -1) + cpu = target; + rcu_read_unlock(); + + return cpu; +} +#endif + +/* + * Return whether the task on the given cpu is currently non-preemptible + * while handling a potentially long softint, or if the task is likely + * to block preemptions soon because it is a ksoftirq thread that is + * handling slow softints. + */ +bool +task_may_not_preempt(struct task_struct *task, int cpu) +{ + __u32 softirqs = per_cpu(active_softirqs, cpu) | + __IRQ_STAT(cpu, __softirq_pending); + struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu); + + return ((softirqs & LONG_SOFTIRQ_MASK) && + (task == cpu_ksoftirqd || + task_thread_info(task)->preempt_count & SOFTIRQ_MASK)); +} + /* * Perform a schedtune dequeue and cancelation of boost timers if needed. * Should be called only with the rq->lock held. 
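The hunk above ties RT wakeup placement to softirq activity: task_may_not_preempt() reports a CPU as a poor wakeup target when a "long" softirq (per LONG_SOFTIRQ_MASK) is active or pending there and the current task is either ksoftirqd or is itself running in softirq context, and the following select_task_rq_rt() hunk uses that result to prefer a remote CPU even if it runs a higher-priority task. The sketch below is illustrative only and not part of the patch; it reduces the check to plain bit tests in a self-contained userspace program, and the softirq numbers, the composition of LONG_SOFTIRQ_MASK, and the function name cpu_may_not_preempt() are assumptions rather than the kernel's exact definitions.

#include <stdbool.h>
#include <stdio.h>

/* Softirq numbers, assumed here purely for illustration. */
enum {
	HI_SOFTIRQ = 0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ, IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ, RCU_SOFTIRQ
};

/* Softirqs that can monopolize a CPU long enough to starve an RT wakeup
 * (assumed mask contents). */
#define LONG_SOFTIRQ_MASK \
	((1u << NET_TX_SOFTIRQ) | (1u << NET_RX_SOFTIRQ) | \
	 (1u << BLOCK_SOFTIRQ) | (1u << IRQ_POLL_SOFTIRQ) | \
	 (1u << TASKLET_SOFTIRQ))

/*
 * The placement rule in miniature: a CPU is a poor RT wakeup target when a
 * long softirq is active or pending there and the current task either is
 * ksoftirqd or is executing in softirq context, because it will not yield
 * the CPU to the waking RT task any time soon.
 */
static bool cpu_may_not_preempt(unsigned int active, unsigned int pending,
				bool curr_is_ksoftirqd, bool curr_in_softirq)
{
	unsigned int softirqs = active | pending;

	return (softirqs & LONG_SOFTIRQ_MASK) &&
	       (curr_is_ksoftirqd || curr_in_softirq);
}

int main(void)
{
	/* ksoftirqd flushing NET_RX: steer the RT wakeup elsewhere (prints 1). */
	printf("%d\n", cpu_may_not_preempt(1u << NET_RX_SOFTIRQ, 0, true, false));
	/* Only a timer softirq pending: this CPU is fine (prints 0). */
	printf("%d\n", cpu_may_not_preempt(0, 1u << TIMER_SOFTIRQ, false, false));
	return 0;
}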
@@ -1501,6 +1609,11 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, { struct task_struct *curr; struct rq *rq; + bool may_not_preempt; + +#ifdef CONFIG_SCHED_HMP + return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); +#endif /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1512,7 +1625,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, curr = READ_ONCE(rq->curr); /* unlocked access */ /* - * If the current task on @p's runqueue is an RT task, then + * If the current task on @p's runqueue is a softirq task, + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -1533,17 +1656,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. */ - if (curr && unlikely(rt_task(curr)) && + may_not_preempt = task_may_not_preempt(curr, cpu); + if (may_not_preempt || + (unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + curr->prio <= p->prio))) { int target = find_lowest_rq(p); /* - * Don't bother moving it if the destination CPU is - * not running a lower priority task. + * If cpu is non-preemptible, prefer remote cpu + * even if it's running a higher-prio task. + * Otherwise: Don't bother moving it if the + * destination CPU is not running a lower priority task. */ if (target != -1 && - p->prio < cpu_rq(target)->rt.highest_prio.curr) + (may_not_preempt || + p->prio < cpu_rq(target)->rt.highest_prio.curr)) cpu = target; } rcu_read_unlock(); @@ -1752,6 +1880,109 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); +#ifdef CONFIG_SCHED_HMP + +static int find_lowest_rq_hmp(struct task_struct *task) +{ + struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask); + struct cpumask candidate_mask = CPU_MASK_NONE; + struct sched_cluster *cluster; + int best_cpu = -1; + int prev_cpu = task_cpu(task); + u64 cpu_load, min_load = ULLONG_MAX; + int i; + int restrict_cluster; + int boost_on_big; + int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX; + + boost_on_big = sched_boost() == FULL_THROTTLE_BOOST && + sched_boost_policy() == SCHED_BOOST_ON_BIG; + + restrict_cluster = sysctl_sched_restrict_cluster_spill; + + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return best_cpu; + + if (task->nr_cpus_allowed == 1) + return best_cpu; /* No other targets possible */ + + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return best_cpu; /* No targets found */ + + pack_task = is_short_burst_task(task); + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. 
+ */ + +retry: + for_each_sched_cluster(cluster) { + if (boost_on_big && cluster->capacity != max_possible_capacity) + continue; + + cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + cpumask_andnot(&candidate_mask, &candidate_mask, + cpu_isolated_mask); + /* + * When placement boost is active, if there is no eligible CPU + * in the highest capacity cluster, we fallback to the other + * clusters. So clear the CPUs of the traversed cluster from + * the lowest_mask. + */ + if (unlikely(boost_on_big)) + cpumask_andnot(lowest_mask, lowest_mask, + &cluster->cpus); + + if (cpumask_empty(&candidate_mask)) + continue; + + for_each_cpu(i, &candidate_mask) { + if (sched_cpu_high_irqload(i)) + continue; + + cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg; + if (!restrict_cluster) + cpu_load = scale_load_to_cpu(cpu_load, i); + + if (pack_task) { + wakeup_latency = cpu_rq(i)->wakeup_latency; + + if (wakeup_latency > least_wakeup_latency) + continue; + + if (wakeup_latency < least_wakeup_latency) { + least_wakeup_latency = wakeup_latency; + min_load = cpu_load; + best_cpu = i; + continue; + } + } + + if (cpu_load < min_load || + (cpu_load == min_load && + (i == prev_cpu || (best_cpu != prev_cpu && + cpus_share_cache(prev_cpu, i))))) { + min_load = cpu_load; + best_cpu = i; + } + } + + if (restrict_cluster && best_cpu != -1) + break; + } + + if (unlikely(boost_on_big && best_cpu == -1)) { + boost_on_big = 0; + goto retry; + } + + return best_cpu; +} +#endif /* CONFIG_SCHED_HMP */ + static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; @@ -1759,6 +1990,10 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); +#ifdef CONFIG_SCHED_HMP + return find_lowest_rq_hmp(task); +#endif + /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) return -1; @@ -1975,11 +2210,13 @@ retry: goto retry; } + next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, lowest_rq->cpu); next_task->on_rq = TASK_ON_RQ_QUEUED; activate_task(lowest_rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_QUEUED; ret = 1; resched_curr(lowest_rq); @@ -2249,11 +2486,13 @@ static void pull_rt_task(struct rq *this_rq) resched = true; + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); p->on_rq = TASK_ON_RQ_QUEUED; activate_task(this_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; /* * We continue with the search, just in * case there's an even higher prio task @@ -2326,7 +2565,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. 
*/ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; queue_pull_task(rq); @@ -2341,6 +2581,7 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } + #endif /* CONFIG_SMP */ /* @@ -2514,6 +2755,11 @@ const struct sched_class rt_sched_class = { .switched_to = switched_to_rt, .update_curr = update_curr_rt, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_rt, + .dec_hmp_sched_stats = dec_hmp_sched_stats_rt, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d066a6870245..90cc450dff7e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,14 +28,13 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); + extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); -extern void check_for_migration(struct rq *rq, struct task_struct *p); #else static inline void update_cpu_load_active(struct rq *this_rq) { } -static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } #endif /* @@ -245,6 +244,10 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_SCHED_HMP + bool upmigrate_discouraged; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -350,12 +353,96 @@ static inline void set_task_rq_fair(struct sched_entity *se, #endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ +extern struct task_group *css_tg(struct cgroup_subsys_state *css); #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +#ifdef CONFIG_SCHED_HMP + +#define NUM_TRACKED_WINDOWS 2 +#define NUM_LOAD_INDICES 1000 + +struct hmp_sched_stats { + int nr_big_tasks; + u64 cumulative_runnable_avg; + u64 pred_demands_sum; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_mitigated_freq = thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; + int dstate, dstate_wakeup_latency, dstate_wakeup_energy; + unsigned int static_cluster_pwr_cost; + int notifier_sent; + bool wake_up_idle; + atomic64_t last_cc_update; + atomic64_t cycles; +}; + +extern unsigned long all_cluster_ids[]; + +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +struct related_thread_group { + int id; + raw_spinlock_t lock; + struct list_head tasks; + struct list_head list; + struct sched_cluster *preferred_cluster; + struct rcu_head rcu; + u64 last_update; +}; + +extern struct list_head cluster_head; +extern struct sched_cluster *sched_cluster[NR_CPUS]; + +struct 
cpu_cycle { + u64 cycles; + u64 time; +}; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +extern unsigned int sched_disable_window_stats; +#endif /* CONFIG_SCHED_HMP */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -424,11 +511,12 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ -#ifdef CONFIG_SCHED_WALT - u64 cumulative_runnable_avg; +#ifdef CONFIG_CFS_BANDWIDTH + +#ifdef CONFIG_SCHED_HMP + struct hmp_sched_stats hmp_stats; #endif -#ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; @@ -698,6 +786,38 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_HMP + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct hmp_sched_stats hmp_stats; + + int cstate, wakeup_latency, wakeup_energy; + u64 window_start; + u64 load_reported_window; + unsigned long hmp_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + unsigned int static_cpu_pwr_cost; + struct task_struct *ed_task; + struct cpu_cycle cc; + u64 old_busy_time, old_busy_time_group; + u64 old_estimated_time; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + struct group_cpu_time grp_time; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; + DECLARE_BITMAP_ARRAY(top_tasks_bitmap, + NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); + u8 *top_tasks[NUM_TRACKED_WINDOWS]; + u8 curr_table; + int prev_top; + int curr_top; +#endif + #ifdef CONFIG_SCHED_WALT u64 cumulative_runnable_avg; u64 window_start; @@ -711,7 +831,6 @@ struct rq { u64 cum_window_demand; #endif /* CONFIG_SCHED_WALT */ - #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -986,6 +1105,652 @@ static inline void sched_ttwu_pending(void) { } #include "stats.h" #include "auto_group.h" +enum sched_boost_policy { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; + +#ifdef CONFIG_SCHED_HMP + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define SCHED_UPMIGRATE_MIN_NICE 15 +#define EXITING_TASK_MARKER 0xdeaddead + +#define UP_MIGRATION 1 +#define DOWN_MIGRATION 2 +#define IRQLOAD_MIGRATION 3 + +extern struct mutex policy_mutex; +extern unsigned int sched_ravg_window; +extern unsigned int sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int pct_task_load(struct task_struct *p); +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; +extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int sched_init_task_load_windows; +extern unsigned int up_down_migrate_scale_factor; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; +extern struct sched_cluster init_cluster; +extern unsigned int __read_mostly sched_short_sleep_task_threshold; +extern unsigned int __read_mostly sched_long_cpu_selection_threshold; +extern unsigned int __read_mostly sched_big_waker_task_load; +extern unsigned int __read_mostly sched_small_wakee_task_load; +extern unsigned int __read_mostly sched_spill_load; 
+extern unsigned int __read_mostly sched_upmigrate; +extern unsigned int __read_mostly sched_downmigrate; +extern unsigned int __read_mostly sched_load_granule; + +extern void init_new_task_load(struct task_struct *p); +extern u64 sched_ktime_clock(void); +extern int got_boost_kick(void); +extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +extern bool early_detection_notify(struct rq *rq, u64 wallclock); +extern void clear_ed_task(struct task_struct *p, struct rq *rq); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void clear_boost_kick(int cpu); +extern void clear_hmp_request(int cpu); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +extern void update_cluster_topology(void); +extern void note_task_waking(struct task_struct *p, u64 wallclock); +extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); +extern void init_clusters(void); +extern void reset_cpu_hmp_stats(int cpu, int reset_cra); +extern unsigned int max_task_load(void); +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); +extern void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock); +extern unsigned int cpu_temp(int cpu); +extern unsigned int nr_eligible_big_tasks(int cpu); +extern int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load); +extern void set_preferred_cluster(struct related_thread_group *grp); +extern void add_new_task_to_grp(struct task_struct *new); +extern unsigned int update_freq_aggregate_threshold(unsigned int threshold); +extern void update_avg_burst(struct task_struct *p); +extern void update_avg(u64 *avg, u64 sample); + +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define RESTRAINED_BOOST 3 + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline int cpu_efficiency(int cpu) +{ + return cpu_rq(cpu)->cluster->efficiency; +} + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline unsigned int cpu_min_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->min_freq; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. 
+ */ + return min(cluster->max_mitigated_freq, cluster->max_freq); +} + +static inline unsigned int cpu_max_freq(int cpu) +{ + return cluster_max_freq(cpu_rq(cpu)->cluster); +} + +static inline unsigned int cpu_max_possible_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_freq; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster; +} + +static inline int cpu_max_power_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->max_power_cost; +} + +static inline int cpu_min_power_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->min_power_cost; +} + +static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period) +{ + return div64_u64(cycles, period); +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +/* + * 'load' is in reference to "best cpu" at its best frequency. + * Scale that in reference to a given cpu, accounting for how bad it is + * in reference to "best cpu". + */ +static inline u64 scale_load_to_cpu(u64 task_load, int cpu) +{ + u64 lsf = cpu_load_scale_factor(cpu); + + if (lsf != 1024) { + task_load *= lsf; + task_load /= 1024; + } + + return task_load; +} + +static inline unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +static inline void +inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ + u32 task_load; + + if (sched_disable_window_stats) + return; + + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; + + stats->cumulative_runnable_avg += task_load; + stats->pred_demands_sum += p->ravg.pred_demand; +} + +static inline void +dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ + u32 task_load; + + if (sched_disable_window_stats) + return; + + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; + + stats->cumulative_runnable_avg -= task_load; + + BUG_ON((s64)stats->cumulative_runnable_avg < 0); + + stats->pred_demands_sum -= p->ravg.pred_demand; + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +static inline void +fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p, s64 task_load_delta, + s64 pred_demand_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg += task_load_delta; + BUG_ON((s64)stats->cumulative_runnable_avg < 0); + + stats->pred_demands_sum += pred_demand_delta; + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +#define pct_to_real(tunable) \ + (div64_u64((u64)tunable * (u64)max_task_load(), 100)) + +#define real_to_pct(tunable) \ + (div64_u64((u64)tunable * (u64)100, (u64)max_task_load())) + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return !!(rcu_access_pointer(p->grp) != NULL); +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return rcu_dereference(p->grp); +} + +#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand) + +extern void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); + +extern void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p); + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +#define BOOST_KICK 0 +#define CPU_RESERVED 1 + +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + /* Name boost_flags as hmp_flags? */ + return test_and_set_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline u64 cpu_cravg_sync(int cpu, int sync) +{ + struct rq *rq = cpu_rq(cpu); + u64 load; + + load = rq->hmp_stats.cumulative_runnable_avg; + + /* + * If load is being checked in a sync wakeup environment, + * we may want to discount the load of the currently running + * task. 
+ */ + if (sync && cpu == smp_processor_id()) { + if (load > rq->curr->ravg.demand) + load -= rq->curr->ravg.demand; + else + load = 0; + } + + return load; +} + +static inline bool is_short_burst_task(struct task_struct *p) +{ + return p->ravg.avg_burst < sysctl_sched_short_burst && + p->ravg.avg_sleep_time > sysctl_sched_short_sleep; +} + +extern void check_for_migration(struct rq *rq, struct task_struct *p); +extern void pre_big_task_count_change(const struct cpumask *cpus); +extern void post_big_task_count_change(const struct cpumask *cpus); +extern void set_hmp_defaults(void); +extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost); +extern unsigned int power_cost(int cpu, u64 demand); +extern void reset_all_window_stats(u64 window_start, unsigned int window_size); +extern int sched_boost(void); +extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_policy boost_policy); +extern enum sched_boost_policy sched_boost_policy(void); +extern int task_will_fit(struct task_struct *p, int cpu); +extern u64 cpu_load(int cpu); +extern u64 cpu_load_sync(int cpu, int sync); +extern int preferred_cluster(struct sched_cluster *cluster, + struct task_struct *p); +extern void inc_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void dec_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void inc_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern void dec_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra); +extern int is_big_task(struct task_struct *p); +extern int upmigrate_discouraged(struct task_struct *p); +extern struct sched_cluster *rq_cluster(struct rq *rq); +extern int nr_big_tasks(struct rq *rq); +extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta); +extern void reset_task_stats(struct task_struct *p); +extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra); +extern void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra); +extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage); +extern void sched_boost_parse_dt(void); +extern void clear_top_tasks_bitmap(unsigned long *bitmap); + +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +extern bool task_sched_boost(struct task_struct *p); +extern int sync_cgroup_colocation(struct task_struct *p, bool insert); +extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2); +extern void update_cgroup_boost_settings(void); +extern void restore_cgroup_boost_settings(void); + +#else +static inline bool +same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return true; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} + +static inline void update_cgroup_boost_settings(void) { } +static inline void restore_cgroup_boost_settings(void) { } +#endif + +extern int alloc_related_thread_groups(void); + +#else /* CONFIG_SCHED_HMP */ + +struct hmp_sched_stats; +struct related_thread_group; +struct sched_cluster; + +static inline enum sched_boost_policy sched_boost_policy(void) +{ + return SCHED_BOOST_NONE; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} 
+ +static inline int got_boost_kick(void) +{ + return 0; +} + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + return 0; +} + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { } +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void clear_boost_kick(int cpu) { } +static inline void clear_hmp_request(int cpu) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void init_clusters(void) {} +static inline void update_cluster_topology(void) { } +static inline void note_task_waking(struct task_struct *p, u64 wallclock) { } +static inline void set_task_last_switch_out(struct task_struct *p, + u64 wallclock) { } + +static inline int task_will_fit(struct task_struct *p, int cpu) +{ + return 1; +} + +static inline int select_best_cpu(struct task_struct *p, int target, + int reason, int sync) +{ + return 0; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline int sched_boost(void) +{ + return 0; +} + +static inline int is_big_task(struct task_struct *p) +{ + return 0; +} + +static inline int nr_big_tasks(struct rq *rq) +{ + return 0; +} + +static inline int is_cpu_throttling_imminent(int cpu) +{ + return 0; +} + +static inline int is_task_migration_throttled(struct task_struct *p) +{ + return 0; +} + +static inline unsigned int cpu_temp(int cpu) +{ + return 0; +} + +static inline void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + return 1; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return NULL; +} + +static inline void init_new_task_load(struct task_struct *p) +{ +} + +static inline u64 scale_load_to_cpu(u64 load, int cpu) +{ + return load; +} + +static inline unsigned int nr_eligible_big_tasks(int cpu) +{ + return 0; +} + +static inline bool is_max_capacity_cpu(int cpu) { return true; } + +static inline int pct_task_load(struct task_struct *p) { return 0; } + +static inline int cpu_capacity(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } + +static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ +} + +static inline void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock) +{ +} + +static inline int sched_cpu_high_irqload(int cpu) { return 0; } + +static inline void set_preferred_cluster(struct related_thread_group *grp) { } + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return false; +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return NULL; +} + +static inline 
u32 task_load(struct task_struct *p) { return 0; } + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + return 0; +} + +static inline void add_new_task_to_grp(struct task_struct *new) {} + +#define PRED_DEMAND_DELTA (0) + +static inline void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } + +static inline void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } +static inline void pre_big_task_count_change(void) { } +static inline void post_big_task_count_change(void) { } +static inline void set_hmp_defaults(void) { } + +static inline void clear_reserved(int cpu) { } +static inline void sched_boost_parse_dt(void) {} +static inline int alloc_related_thread_groups(void) { return 0; } + +#define trace_sched_cpu_load(...) +#define trace_sched_cpu_load_lb(...) +#define trace_sched_cpu_load_cgroup(...) +#define trace_sched_cpu_load_wakeup(...) + +static inline void update_avg_burst(struct task_struct *p) {} + +#endif /* CONFIG_SCHED_HMP */ + +/* + * Returns the rq capacity of any rq in a group. This does not play + * well with groups where rq capacity can change independently. + */ +#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) + #ifdef CONFIG_CGROUP_SCHED /* @@ -1032,7 +1797,6 @@ static inline struct task_group *task_group(struct task_struct *p) { return NULL; } - #endif /* CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1090,7 +1854,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; @@ -1186,6 +1950,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_NO_NOTIFIER 0x08 /* do not notify governor */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1240,19 +2005,41 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 -#define ENQUEUE_WAKEUP_NEW 0x20 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 +#define ENQUEUE_WAKEUP_NEW 0x40 #define RETRY_TASK ((void *)-1UL) @@ -1319,6 +2106,12 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif +#ifdef CONFIG_SCHED_HMP + void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p); + void (*dec_hmp_sched_stats)(struct rq *rq, struct task_struct *p); + void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand); +#endif }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -1343,6 +2136,7 @@ extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +extern void nohz_balance_clear_nohz_mask(int cpu); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); @@ -1400,7 +2194,9 @@ static inline int idle_get_state_idx(struct rq *rq) } #endif +#ifdef CONFIG_SYSRQ_SCHED_DEBUG extern void sysrq_sched_debug_show(void); +#endif extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1428,6 +2224,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; + sched_update_nr_prod(cpu_of(rq), count, true); rq->nr_running = prev_nr + count; if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1454,6 +2251,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count) static inline void __sub_nr_running(struct rq *rq, unsigned count) { + sched_update_nr_prod(cpu_of(rq), count, false); rq->nr_running -= count; } @@ -1617,6 +2415,7 @@ static inline unsigned long __cpu_util(int cpu, int delta) util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, walt_ravg_window >> SCHED_LOAD_SHIFT); #endif + delta += util; if (delta < 0) return 0; @@ -1644,6 +2443,20 @@ static inline unsigned long cpu_util_freq(int cpu) #endif +#ifdef CONFIG_SCHED_HMP +/* + * HMP and EAS are orthogonal. Hopefully the compiler just elides out all code + * with the energy_aware() check, so that we don't even pay the comparison + * penalty at runtime. 
+ */ +#define energy_aware() false +#else +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); @@ -1884,6 +2697,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +/* + * task_may_not_preempt - check whether a task may not be preemptible soon + */ +extern bool task_may_not_preempt(struct task_struct *task, int cpu); + #else /* CONFIG_SMP */ /* @@ -1951,6 +2769,9 @@ enum rq_nohz_flag_bits { NOHZ_BALANCE_KICK, }; +#define NOHZ_KICK_ANY 0 +#define NOHZ_KICK_RESTRICT 1 + #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif @@ -2032,6 +2853,18 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; +#ifdef CONFIG_SCHED_HMP + /* + * Skip if we've already reported, but not if this is an inter-cluster + * migration + */ + if (!sched_disable_window_stats && + (rq->load_reported_window == rq->window_start) && + !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG)) + return; + rq->load_reported_window = rq->window_start; +#endif + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); if (data) data->func(data, rq_clock(rq), flags); diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 000000000000..f03ed685f102 --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,199 @@ +/* Copyright (c) 2012, 2015-2017, 2018 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * Scheduler hook for average runqueue determination + */ +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/hrtimer.h> +#include <linux/sched.h> +#include <linux/math64.h> + +#include "sched.h" +#include <trace/events/sched.h> + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. 
+ * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg, + unsigned int *max_nr, unsigned int *big_max_nr) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 diff = curr_time - last_get_time; + u64 tmp_avg = 0, tmp_iowait = 0, tmp_big_avg = 0; + + *avg = 0; + *iowait_avg = 0; + *big_avg = 0; + *max_nr = 0; + *big_max_nr = 0; + + if (!diff) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_avg += per_cpu(nr_prod_sum, cpu); + tmp_avg += per_cpu(nr, cpu) * diff; + + tmp_big_avg += per_cpu(nr_big_prod_sum, cpu); + tmp_big_avg += nr_eligible_big_tasks(cpu) * diff; + + tmp_iowait += per_cpu(iowait_prod_sum, cpu); + tmp_iowait += nr_iowait_cpu(cpu) * diff; + + per_cpu(last_time, cpu) = curr_time; + + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + + if (*max_nr < per_cpu(nr_max, cpu)) + *max_nr = per_cpu(nr_max, cpu); + + if (is_max_capacity_cpu(cpu)) { + if (*big_max_nr < per_cpu(nr_max, cpu)) + *big_max_nr = per_cpu(nr_max, cpu); + } + + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + diff = curr_time - last_get_time; + last_get_time = curr_time; + + /* + * Any task running on BIG cluster and BIG tasks running on little + * cluster contributes to big_avg. Small or medium tasks can also + * run on BIG cluster when co-location and scheduler boost features + * are activated. We don't want these tasks to downmigrate to little + * cluster when BIG CPUs are available but isolated. Round up the + * average values so that core_ctl aggressively unisolate BIG CPUs. + */ + *avg = (int)DIV64_U64_ROUNDUP(tmp_avg, diff); + *big_avg = (int)DIV64_U64_ROUNDUP(tmp_big_avg, diff); + *iowait_avg = (int)DIV64_U64_ROUNDUP(tmp_iowait, diff); + + trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg, + *max_nr, *big_max_nr); + + BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0); + pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n", + __func__, *avg, *big_avg, *iowait_avg); +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 + +#ifdef CONFIG_SCHED_HMP +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue) { + u64 load; + + load = cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg; + load = scale_load_to_cpu(load, cpu); + + if (load * BUSY_LOAD_FACTOR > sched_ravg_window) + load_trigger = true; + } + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} +#else +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ +} +#endif + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. 
+ * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index a5567ccd8803..3278c81cefb1 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,5 +1,4 @@ #include "sched.h" -#include "walt.h" /* * stop-task scheduling class. @@ -19,6 +18,41 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static void check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) { @@ -44,14 +78,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); - walt_inc_cumulative_runnable_avg(rq, p); + inc_hmp_sched_stats_stop(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); - walt_dec_cumulative_runnable_avg(rq, p); + dec_hmp_sched_stats_stop(rq, p); } static void yield_task_stop(struct rq *rq) @@ -138,4 +172,9 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_stop, + .dec_hmp_sched_stats = dec_hmp_sched_stats_stop, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop, +#endif }; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d444fc1a4d58..b84d13750604 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -121,6 +121,33 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; +#ifdef 
CONFIG_SCHED_HMP
+	/* Toggle ability to override sched boost enabled */
+	bool sched_boost_no_override;
+
+	/*
+	 * Controls whether a cgroup is eligible for sched boost or not. This
+	 * can temporarily be disabled by the kernel based on the no_override
+	 * flag above.
+	 */
+	bool sched_boost_enabled;
+
+	/*
+	 * This tracks the default value of sched_boost_enabled and is used to
+	 * restore the value following any temporary changes to that flag.
+	 */
+	bool sched_boost_enabled_backup;
+
+	/*
+	 * Controls whether tasks of this cgroup should be colocated with each
+	 * other and tasks of other cgroups that have the same flag turned on.
+	 */
+	bool colocate;
+
+	/* Controls whether further updates are allowed to the colocate flag */
+	bool colocate_update_disabled;
+#endif
+
 	/* Performance Boost (B) region threshold params */
 	int perf_boost_idx;
 
@@ -134,7 +161,7 @@ struct schedtune {
 static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
 {
-	return css ? container_of(css, struct schedtune, css) : NULL;
+	return container_of(css, struct schedtune, css);
 }
 
 static inline struct schedtune *task_schedtune(struct task_struct *tsk)
@@ -159,6 +186,13 @@ static inline struct schedtune *parent_st(struct schedtune *st)
 
 static struct schedtune root_schedtune = {
 	.boost	= 0,
+#ifdef CONFIG_SCHED_HMP
+	.sched_boost_no_override = false,
+	.sched_boost_enabled = true,
+	.sched_boost_enabled_backup = true,
+	.colocate = false,
+	.colocate_update_disabled = false,
+#endif
 	.perf_boost_idx = 0,
 	.perf_constrain_idx = 0,
 	.prefer_idle = 0,
@@ -239,6 +273,121 @@ struct boost_groups {
 /* Boost groups affecting each CPU in the system */
 DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
 
+#ifdef CONFIG_SCHED_HMP
+static inline void init_sched_boost(struct schedtune *st)
+{
+	st->sched_boost_no_override = false;
+	st->sched_boost_enabled = true;
+	st->sched_boost_enabled_backup = st->sched_boost_enabled;
+	st->colocate = false;
+	st->colocate_update_disabled = false;
+}
+
+bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+	return task_schedtune(tsk1) == task_schedtune(tsk2);
+}
+
+void update_cgroup_boost_settings(void)
+{
+	int i;
+
+	for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
+		if (!allocated_group[i])
+			break;
+
+		if (allocated_group[i]->sched_boost_no_override)
+			continue;
+
+		allocated_group[i]->sched_boost_enabled = false;
+	}
+}
+
+void restore_cgroup_boost_settings(void)
+{
+	int i;
+
+	for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
+		if (!allocated_group[i])
+			break;
+
+		allocated_group[i]->sched_boost_enabled =
+			allocated_group[i]->sched_boost_enabled_backup;
+	}
+}
+
+bool task_sched_boost(struct task_struct *p)
+{
+	struct schedtune *st = task_schedtune(p);
+
+	return st->sched_boost_enabled;
+}
+
+static u64
+sched_boost_override_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->sched_boost_no_override;
+}
+
+static int sched_boost_override_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 override)
+{
+	struct schedtune *st = css_st(css);
+
+	st->sched_boost_no_override = !!override;
+
+	return 0;
+}
+
+static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->sched_boost_enabled;
+}
+
+static int sched_boost_enabled_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 enable)
+{
+	struct schedtune *st = css_st(css);
+
+	st->sched_boost_enabled = !!enable;
+	st->sched_boost_enabled_backup = 
st->sched_boost_enabled; + + return 0; +} + +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct schedtune *st = css_st(css); + + if (st->colocate_update_disabled) + return -EPERM; + + st->colocate = !!colocate; + st->colocate_update_disabled = true; + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_sched_boost(struct schedtune *st) { } + +#endif /* CONFIG_SCHED_HMP */ + static void schedtune_cpu_update(int cpu) { @@ -569,7 +718,7 @@ prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 prefer_idle) { struct schedtune *st = css_st(css); - st->prefer_idle = prefer_idle; + st->prefer_idle = !!prefer_idle; return 0; } @@ -619,6 +768,22 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, return 0; } +static void schedtune_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct schedtune *st; + bool colocate; + + cgroup_taskset_first(tset, &css); + st = css_st(css); + + colocate = st->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + static struct cftype files[] = { { .name = "boost", @@ -630,6 +795,23 @@ static struct cftype files[] = { .read_u64 = prefer_idle_read, .write_u64 = prefer_idle_write, }, +#ifdef CONFIG_SCHED_HMP + { + .name = "sched_boost_no_override", + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "sched_boost_enabled", + .read_u64 = sched_boost_enabled_read, + .write_u64 = sched_boost_enabled_write, + }, + { + .name = "colocate", + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif { } /* terminate */ }; @@ -683,6 +865,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) /* Initialize per CPUs boost group support */ st->idx = idx; + init_sched_boost(st); if (schedtune_boostgroup_init(st)) goto release; @@ -720,6 +903,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, + .attach = schedtune_attach, }; static inline void @@ -915,7 +1099,8 @@ schedtune_init(void) */ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); if (!sd) { - pr_info("schedtune: no energy model data\n"); + if (energy_aware()) + pr_warn("schedtune: no energy model data\n"); goto nodata; } |
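The tune.c hunks above add write-once semantics for the per-cgroup colocate attribute: sched_colocate_write() latches the first value and sets colocate_update_disabled, so every later write fails with -EPERM, and schedtune_attach() then propagates the latched setting to tasks migrating into the cgroup via sync_cgroup_colocation(). The snippet below is an illustrative sketch, not part of the patch; struct tune_group and tune_group_set_colocate() are made-up names that model the same behavior outside the cgroup framework.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-cgroup schedtune state touched by the patch. */
struct tune_group {
	bool colocate;
	bool colocate_update_disabled;
};

/*
 * Models sched_colocate_write(): the first write latches the value and
 * disables further updates, so subsequent writes fail with -EPERM.
 */
static int tune_group_set_colocate(struct tune_group *st, unsigned long long val)
{
	if (st->colocate_update_disabled)
		return -EPERM;

	st->colocate = !!val;
	st->colocate_update_disabled = true;
	return 0;
}

int main(void)
{
	struct tune_group st = { false, false };

	printf("first write  -> %d\n", tune_group_set_colocate(&st, 1)); /* 0 */
	printf("second write -> %d\n", tune_group_set_colocate(&st, 0)); /* -EPERM */
	printf("colocate stays  %d\n", st.colocate);                     /* 1 */
	return 0;
}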