summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/boost.c217
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/core.c863
-rw-r--r--kernel/sched/core_ctl.c1171
-rw-r--r--kernel/sched/cpupri.c48
-rw-r--r--kernel/sched/cputime.c17
-rw-r--r--kernel/sched/deadline.c50
-rw-r--r--kernel/sched/debug.c46
-rw-r--r--kernel/sched/energy.c10
-rw-r--r--kernel/sched/fair.c2024
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/hmp.c4416
-rw-r--r--kernel/sched/idle_task.c25
-rw-r--r--kernel/sched/rt.c322
-rw-r--r--kernel/sched/sched.h865
-rw-r--r--kernel/sched/sched_avg.c199
-rw-r--r--kernel/sched/stop_task.c45
-rw-r--r--kernel/sched/tune.c191
19 files changed, 10118 insertions, 400 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 99378130a42f..7dde1b9918e4 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,13 +17,14 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o completion.o idle.o sched_avg.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
-obj-$(CONFIG_SCHED_WALT) += walt.o
+obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c
new file mode 100644
index 000000000000..5bdd51b1e55e
--- /dev/null
+++ b/kernel/sched/boost.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "sched.h"
+#include <linux/of.h>
+#include <linux/sched/core_ctl.h>
+#include <trace/events/sched.h>
+
+/*
+ * Scheduler boost is a mechanism to temporarily place tasks on CPUs
+ * with higher capacity than those where a task would have normally
+ * ended up with their load characteristics. Any entity enabling
+ * boost is responsible for disabling it as well.
+ */
+
+/* Currently active boost type; written by _sched_set_boost(), read by sched_boost(). */
+unsigned int sysctl_sched_boost;
+/* Placement policy in effect; chosen by set_boost_policy(). */
+static enum sched_boost_policy boost_policy;
+/* Policy override filled in by sched_boost_parse_dt(); SCHED_BOOST_NONE until then. */
+static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
+/* Held by sched_set_boost() and sched_boost_handler() around boost changes. */
+static DEFINE_MUTEX(boost_mutex);
+/*
+ * Value returned by update_freq_aggregate_threshold(1) when RESTRAINED_BOOST
+ * is enabled; passed back to update_freq_aggregate_threshold() when boost is
+ * turned off again.
+ */
+static unsigned int freq_aggr_threshold_backup;
+
+/*
+ * Flag @cpu's runqueue with BOOST_KICK and send it a reschedule IPI.
+ * The IPI is sent only when the flag was not already set, so a kick that
+ * is still pending is not repeated.
+ */
+static inline void boost_kick(int cpu)
+{
+	struct rq *target = cpu_rq(cpu);
+
+	if (test_and_set_bit(BOOST_KICK, &target->hmp_flags))
+		return;
+
+	smp_send_reschedule(cpu);
+}
+
+/*
+ * Kick every online, non-isolated CPU whose capacity is not max_capacity.
+ * Does nothing unless the active policy is SCHED_BOOST_ON_BIG.
+ */
+static void boost_kick_cpus(void)
+{
+	struct cpumask candidates;
+	int cpu;
+
+	if (boost_policy != SCHED_BOOST_ON_BIG)
+		return;
+
+	cpumask_andnot(&candidates, cpu_online_mask, cpu_isolated_mask);
+
+	for_each_cpu(cpu, &candidates) {
+		/* CPUs already at max capacity need no kick. */
+		if (cpu_capacity(cpu) == max_capacity)
+			continue;
+
+		boost_kick(cpu);
+	}
+}
+
+/*
+ * Return non-zero if BOOST_KICK is set in the hmp_flags of the runqueue
+ * belonging to the CPU this code is running on.
+ */
+int got_boost_kick(void)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+
+	return test_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+/* Clear the BOOST_KICK flag in the hmp_flags of @cpu's runqueue. */
+void clear_boost_kick(int cpu)
+{
+	clear_bit(BOOST_KICK, &cpu_rq(cpu)->hmp_flags);
+}
+
+/*
+ * Scheduler boost type and boost policy might at first seem unrelated,
+ * however, there exists a connection between them that will allow us
+ * to use them interchangeably during placement decisions. We'll explain
+ * the connection here in one possible way so that the implications are
+ * clear when looking at placement policies.
+ *
+ * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
+ * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
+ * neither be none nor RESTRAINED.
+ */
+static void set_boost_policy(int type)
+{
+	enum sched_boost_policy chosen;
+
+	/*
+	 * NOTE(review): @type is a boost type but is compared against the
+	 * policy constant SCHED_BOOST_NONE here; presumably it has the same
+	 * value as NO_BOOST - confirm against the enum definitions.
+	 */
+	if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST)
+		chosen = SCHED_BOOST_NONE;
+	else if (boost_policy_dt)
+		/* A device-tree override wins over the efficiency heuristic. */
+		chosen = boost_policy_dt;
+	else if (min_possible_efficiency != max_possible_efficiency)
+		chosen = SCHED_BOOST_ON_BIG;
+	else
+		chosen = SCHED_BOOST_ON_ALL;
+
+	boost_policy = chosen;
+}
+
+/* Return the boost placement policy last chosen by set_boost_policy(). */
+enum sched_boost_policy sched_boost_policy(void)
+{
+ return boost_policy;
+}
+
+/*
+ * A boost request is valid only when it toggles boost between off (zero)
+ * and on (non-zero). There is no possibility of switching from one boost
+ * type to another, or of setting the same kind of boost several times.
+ */
+static bool verify_boost_params(int old_val, int new_val)
+{
+	bool was_on = old_val != 0;
+	bool will_be_on = new_val != 0;
+
+	return was_on != will_be_on;
+}
+
+/*
+ * Apply a boost transition from @old_val to @type.
+ *
+ * Runs the enable action for @type (or, for NO_BOOST, the disable action
+ * matching @old_val), then updates boost_policy and sysctl_sched_boost and
+ * emits the sched_set_boost trace event. An unrecognised @type triggers
+ * WARN_ON() and returns before any of that state is updated.
+ *
+ * Both callers in this file invoke it with boost_mutex held.
+ */
+static void _sched_set_boost(int old_val, int type)
+{
+ switch (type) {
+ case NO_BOOST:
+ /* Undo whatever the previously active boost type set up. */
+ if (old_val == FULL_THROTTLE_BOOST)
+ core_ctl_set_boost(false);
+ else if (old_val == CONSERVATIVE_BOOST)
+ restore_cgroup_boost_settings();
+ else
+ /* Remaining case: put back the threshold saved below. */
+ update_freq_aggregate_threshold(
+ freq_aggr_threshold_backup);
+ break;
+
+ case FULL_THROTTLE_BOOST:
+ core_ctl_set_boost(true);
+ boost_kick_cpus();
+ break;
+
+ case CONSERVATIVE_BOOST:
+ update_cgroup_boost_settings();
+ boost_kick_cpus();
+ break;
+
+ case RESTRAINED_BOOST:
+ /* Keep the returned value so NO_BOOST can restore it later. */
+ freq_aggr_threshold_backup =
+ update_freq_aggregate_threshold(1);
+ break;
+
+ default:
+ WARN_ON(1);
+ return;
+ }
+
+ set_boost_policy(type);
+ sysctl_sched_boost = type;
+ trace_sched_set_boost(type);
+}
+
+/*
+ * sched_boost_parse_dt() - read an optional boost policy override from DT.
+ *
+ * Looks up the "/sched-hmp" node and, if it carries a "boost-policy" string
+ * of "boost-on-big" or "boost-on-all", records the matching policy in
+ * boost_policy_dt. A missing node, a missing property or any other string
+ * leaves boost_policy_dt untouched.
+ */
+void sched_boost_parse_dt(void)
+{
+	struct device_node *sn;
+	/* Not named boost_policy: that would shadow the file-scope variable. */
+	const char *policy_name;
+
+	sn = of_find_node_by_path("/sched-hmp");
+	if (!sn)
+		return;
+
+	if (!of_property_read_string(sn, "boost-policy", &policy_name)) {
+		if (!strcmp(policy_name, "boost-on-big"))
+			boost_policy_dt = SCHED_BOOST_ON_BIG;
+		else if (!strcmp(policy_name, "boost-on-all"))
+			boost_policy_dt = SCHED_BOOST_ON_ALL;
+	}
+
+	/* of_find_node_by_path() returned the node with a reference held. */
+	of_node_put(sn);
+}
+
+/*
+ * Switch scheduler boost to @type under boost_mutex.
+ *
+ * Returns 0 when the request passed verify_boost_params() and was handed to
+ * _sched_set_boost(), or -EINVAL when it was rejected.
+ */
+int sched_set_boost(int type)
+{
+	int ret = -EINVAL;
+
+	mutex_lock(&boost_mutex);
+
+	if (verify_boost_params(sysctl_sched_boost, type)) {
+		_sched_set_boost(sysctl_sched_boost, type);
+		ret = 0;
+	}
+
+	mutex_unlock(&boost_mutex);
+
+	return ret;
+}
+
+/*
+ * sysctl handler for the boost knob.
+ *
+ * Lets proc_dointvec_minmax() perform the read or write on @table->data,
+ * then, for a successful write, either applies the new value through
+ * _sched_set_boost() or rolls @table->data back to its previous value and
+ * returns -EINVAL when verify_boost_params() rejects the transition.
+ * Presumably @table->data points at sysctl_sched_boost - confirm against the
+ * ctl_table entry.
+ */
+int sched_boost_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	unsigned int *data = (unsigned int *)table->data;
+	unsigned int prev;
+	int ret;
+
+	mutex_lock(&boost_mutex);
+
+	prev = *data;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	/* Only a successful write can change the boost state. */
+	if (!ret && write) {
+		if (verify_boost_params(prev, *data)) {
+			_sched_set_boost(prev, *data);
+		} else {
+			*data = prev;
+			ret = -EINVAL;
+		}
+	}
+
+	mutex_unlock(&boost_mutex);
+
+	return ret;
+}
+
+/* Return the currently active boost type (the value of sysctl_sched_boost). */
+int sched_boost(void)
+{
+ return sysctl_sched_boost;
+}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041f5b0a..bc54e84675da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3ff9a6d11f8..473293dd40a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
* Thomas Gleixner, Mike Kravetz
*/
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
@@ -74,6 +75,8 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/irq.h>
+#include <linux/sched/core_ctl.h>
#include <linux/cpufreq_times.h>
#include <asm/switch_to.h>
@@ -83,14 +86,19 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
+#ifdef CONFIG_MSM_APP_SETTINGS
+#include <asm/app_api.h>
+#endif
#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
+#include "../time/tick-internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-#include "walt.h"
+
+ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -856,6 +864,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -864,6 +873,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
+ trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -879,6 +889,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
+ if (flags & DEQUEUE_SLEEP)
+ clear_ed_task(p, rq);
+
dequeue_task(rq, p, flags);
}
@@ -1094,8 +1107,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
double_unlock_balance(rq, cpu_rq(new_cpu));
@@ -1105,8 +1118,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
@@ -1128,6 +1141,8 @@ struct migration_arg {
*/
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
+ int src_cpu;
+
if (unlikely(!cpu_active(dest_cpu)))
return rq;
@@ -1135,6 +1150,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return rq;
+ src_cpu = cpu_of(rq);
rq = move_queued_task(rq, p, dest_cpu);
return rq;
@@ -1150,6 +1166,8 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ int src_cpu = cpu_of(rq);
+ bool moved = false;
/*
* The original target cpu might have gone down and we might
@@ -1170,12 +1188,18 @@ static int migration_cpu_stop(void *data)
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
- if (task_rq(p) == rq && task_on_rq_queued(p))
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
rq = __migrate_task(rq, p, arg->dest_cpu);
+ moved = true;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
+
+ if (moved)
+ notify_migration(src_cpu, arg->dest_cpu, false, p);
+
return 0;
}
@@ -1234,6 +1258,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
struct rq *rq;
unsigned int dest_cpu;
int ret = 0;
+ cpumask_t allowed_mask;
rq = task_rq_lock(p, &flags);
@@ -1249,18 +1274,25 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
+ cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+ cpumask_and(&allowed_mask, &allowed_mask, cpu_active_mask);
+
+ dest_cpu = cpumask_any(&allowed_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ cpumask_and(&allowed_mask, cpu_active_mask, new_mask);
+ dest_cpu = cpumask_any(&allowed_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
}
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
@@ -1299,6 +1331,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+ * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase migrating task's wait_start
+ * time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1315,7 +1356,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#endif
#endif
- trace_sched_migrate_task(p, new_cpu);
+ trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
@@ -1323,7 +1364,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.nr_migrations++;
perf_event_task_migrate(p);
- walt_fixup_busy_time(p, new_cpu);
+ fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1337,11 +1378,13 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
@@ -1527,7 +1570,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(queued)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ ktime_t to = ktime_set(0, NSEC_PER_MSEC);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
@@ -1573,12 +1616,13 @@ EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
-static int select_fallback_rq(int cpu, struct task_struct *p)
+static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
- enum { cpuset, possible, fail } state = cpuset;
+ enum { cpuset, possible, fail, bug } state = cpuset;
int dest_cpu;
+ int isolated_candidate = -1;
/*
* If the node that the cpu is on has been offlined, cpu_to_node()
@@ -1594,6 +1638,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
continue;
if (!cpu_active(dest_cpu))
continue;
+ if (cpu_isolated(dest_cpu))
+ continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
}
@@ -1606,6 +1652,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
continue;
if (!cpu_active(dest_cpu))
continue;
+ if (cpu_isolated(dest_cpu)) {
+ if (allow_iso)
+ isolated_candidate = dest_cpu;
+ continue;
+ }
+ goto out;
+ }
+
+ if (isolated_candidate != -1) {
+ dest_cpu = isolated_candidate;
goto out;
}
@@ -1624,6 +1680,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
break;
case fail:
+ allow_iso = true;
+ state = bug;
+ break;
+
+ case bug:
BUG();
break;
}
@@ -1652,6 +1713,8 @@ static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
int sibling_count_hint)
{
+ bool allow_isolated = (p->flags & PF_KTHREAD);
+
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
@@ -1669,13 +1732,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
* not worry about this generic constraint ]
*/
if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
- !cpu_online(cpu)))
- cpu = select_fallback_rq(task_cpu(p), p);
+ !cpu_online(cpu)) ||
+ (cpu_isolated(cpu) && !allow_isolated))
+ cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
return cpu;
}
-static void update_avg(u64 *avg, u64 sample)
+void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
*avg += diff >> 3;
@@ -1748,6 +1812,7 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
+
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -1839,6 +1904,8 @@ void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
+ int cpu = smp_processor_id();
+
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
@@ -1846,9 +1913,18 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() &&
+ !got_boost_kick())
return;
+ if (got_boost_kick()) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr->sched_class == &fair_sched_class)
+ check_for_migration(rq, rq->curr);
+ clear_boost_kick(cpu);
+ }
+
/*
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
* traditionally all their work was done from the interrupt return
@@ -1868,7 +1944,7 @@ void scheduler_ipi(void)
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick())) {
+ if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
@@ -1958,11 +2034,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
int sibling_count_hint)
{
unsigned long flags;
- int cpu, success = 0;
+ int cpu, src_cpu, success = 0;
#ifdef CONFIG_SMP
+ unsigned int old_load;
struct rq *rq;
u64 wallclock;
+ struct related_thread_group *grp = NULL;
#endif
+ bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER);
+ bool check_group = false;
+
+ wake_flags &= ~WF_NO_NOTIFIER;
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1972,13 +2054,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
*/
smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ src_cpu = cpu = task_cpu(p);
+
if (!(p->state & state))
goto out;
trace_sched_waking(p);
success = 1; /* we're going to change ->state */
- cpu = task_cpu(p);
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2045,11 +2128,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
rq = cpu_rq(task_cpu(p));
raw_spin_lock(&rq->lock);
- wallclock = walt_ktime_clock();
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ old_load = task_load(p);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
raw_spin_unlock(&rq->lock);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ if (update_preferred_cluster(grp, p, old_load))
+ set_preferred_cluster(grp);
+ rcu_read_unlock();
+ check_group = grp != NULL;
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2058,19 +2150,33 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
sibling_count_hint);
- if (task_cpu(p) != cpu) {
+
+ /* Refresh src_cpu as it could have changed since we last read it */
+ src_cpu = task_cpu(p);
+ if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+ note_task_waking(p, wallclock);
#endif /* CONFIG_SMP */
-
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (freq_notif_allowed) {
+ if (!same_freq_domain(src_cpu, cpu)) {
+ check_for_freq_change(cpu_rq(cpu),
+ false, check_group);
+ check_for_freq_change(cpu_rq(src_cpu),
+ false, check_group);
+ } else if (success) {
+ check_for_freq_change(cpu_rq(cpu), true, false);
+ }
+ }
+
return success;
}
@@ -2086,9 +2192,13 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
+ if (rq != this_rq() || p == current) {
+ printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p,"
+ " this_rq = %p, p = %p, current = %p\n",
+ __func__, task_pid_nr(p), p->comm, rq,
+ this_rq(), p, current);
return;
+ }
lockdep_assert_held(&rq->lock);
@@ -2112,17 +2222,20 @@ static void try_to_wake_up_local(struct task_struct *p)
trace_sched_waking(p);
if (!task_on_rq_queued(p)) {
- u64 wallclock = walt_ktime_clock();
+ u64 wallclock = sched_ktime_clock();
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ note_task_waking(p, wallclock);
}
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
+ /* Todo : Send cpufreq notifier */
}
/**
@@ -2143,6 +2256,26 @@ int wake_up_process(struct task_struct *p)
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_process_no_notif - Wake up a specific process without notifying
+ * governor
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. This is a TASK_NORMAL wakeup through try_to_wake_up() with
+ * WF_NO_NOTIFIER set, which makes try_to_wake_up() skip its
+ * check_for_freq_change() calls for this wakeup.
+ *
+ * Warns if @p is stopped or traced.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process_no_notif(struct task_struct *p)
+{
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1);
+}
+EXPORT_SYMBOL(wake_up_process_no_notif);
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0, 1);
@@ -2167,6 +2300,44 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_yielded = 0;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.sum_history[0]
+ *
+ * Stop accounting (exiting) task's future cpu usage
+ *
+ * We need this so that reset_all_window_stats() can function correctly.
+ * reset_all_window_stats() depends on do_each_thread/for_each_thread task
+ * iterators to reset *all* tasks' statistics. Exiting tasks however become
+ * invisible to those iterators. sched_exit() is called on an exiting task
+ * prior to being removed from task_list, which will let
+ * reset_all_window_stats() function correctly.
+ */
+void sched_exit(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 wallclock;
+
+ /* Done before taking the runqueue lock below. */
+ sched_set_group_id(p, 0);
+
+ rq = task_rq_lock(p, &flags);
+
+ /* rq->curr == p */
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ /* The stats are reset and marked while p is dequeued. */
+ dequeue_task(rq, p, 0);
+ reset_task_stats(p);
+ p->ravg.mark_start = wallclock;
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+
+ enqueue_task(rq, p, 0);
+ clear_ed_task(p, rq);
+ task_rq_unlock(rq, p, &flags);
+ /* Called only after the runqueue lock has been dropped. */
+ free_task_load_ptrs(p);
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
@@ -2188,7 +2359,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
INIT_LIST_HEAD(&p->se.group_node);
- walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2204,6 +2374,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_rt_schedtune_timer(&p->rt);
INIT_LIST_HEAD(&p->rt.run_list);
+ p->rt.timeout = 0;
+ p->rt.time_slice = sched_rr_timeslice;
+ p->rt.on_rq = 0;
+ p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2273,7 +2447,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
- int cpu = get_cpu();
+ int cpu;
+
+ init_new_task_load(p);
+ cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
@@ -2465,11 +2642,10 @@ void wake_up_new_task(struct task_struct *p)
unsigned long flags;
struct rq *rq;
+ add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, flags);
p->state = TASK_RUNNING;
- walt_init_new_task_load(p);
-
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2484,10 +2660,9 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
#endif
rq = __task_rq_lock(p);
+ mark_task_starting(p);
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
-
- walt_mark_task_starting(p);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2615,6 +2790,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+
+#ifdef CONFIG_MSM_APP_SETTINGS
+ if (use_app_setting)
+ switch_app_setting_bit(prev, next);
+
+ if (use_32bit_app_setting || use_32bit_app_setting_pro)
+ switch_32bit_app_setting_bit(prev, next);
+#endif
}
/**
@@ -2906,7 +3089,7 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
*load = rq->load.weight;
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP)
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
@@ -2916,18 +3099,23 @@ void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
- int dest_cpu;
+ int dest_cpu, curr_cpu;
+
+#ifdef CONFIG_SCHED_HMP
+ return;
+#endif
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ curr_cpu = task_cpu(p);
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
if (dest_cpu == smp_processor_id())
goto unlock;
- if (likely(cpu_active(dest_cpu))) {
+ if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
struct migration_arg arg = { p, dest_cpu };
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+ stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
return;
}
unlock:
@@ -2994,19 +3182,31 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ u64 wallclock;
+ bool early_notif;
+ u32 old_load;
+ struct related_thread_group *grp;
sched_clock_tick();
raw_spin_lock(&rq->lock);
- walt_set_window_start(rq);
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- walt_ktime_clock(), 0);
+ old_load = task_load(curr);
+ set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
calc_global_load_tick(rq);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+
+ cpufreq_update_util(rq, 0);
+ early_notif = early_detection_notify(rq, wallclock);
raw_spin_unlock(&rq->lock);
+ if (early_notif)
+ atomic_notifier_call_chain(&load_alert_notifier_head,
+ 0, (void *)(long)cpu);
+
perf_event_task_tick();
#ifdef CONFIG_SMP
@@ -3015,8 +3215,17 @@ void scheduler_tick(void)
#endif
rq_last_tick_reset(rq);
+ rcu_read_lock();
+ grp = task_related_thread_group(curr);
+ if (update_preferred_cluster(grp, curr, old_load))
+ set_preferred_cluster(grp);
+ rcu_read_unlock();
+
if (curr->sched_class == &fair_sched_class)
check_for_migration(rq, curr);
+
+ if (cpu == tick_do_timer_cpu)
+ core_ctl_check(wallclock);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -3059,9 +3268,24 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
+/*
+ * preemptoff stack tracing threshold in ns.
+ * default: 1ms
+ */
+unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;
+
+/*
+ * Snapshot recorded by preempt_count_add() and read back by
+ * preempt_count_sub() to measure and trace long preempt-disabled sections.
+ */
+struct preempt_store {
+ u64 ts; /* sched_clock() value at the time of recording */
+ unsigned long caddr[4]; /* CALLER_ADDR0..CALLER_ADDR3 at the time of recording */
+ bool irqs_disabled; /* irqs_disabled() result at the time of recording */
+};
+
+static DEFINE_PER_CPU(struct preempt_store, the_ps);
void preempt_count_add(int val)
{
+ struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
+
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Underflow?
@@ -3082,6 +3306,13 @@ void preempt_count_add(int val)
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
+ ps->ts = sched_clock();
+ ps->caddr[0] = CALLER_ADDR0;
+ ps->caddr[1] = CALLER_ADDR1;
+ ps->caddr[2] = CALLER_ADDR2;
+ ps->caddr[3] = CALLER_ADDR3;
+ ps->irqs_disabled = irqs_disabled();
+
trace_preempt_off(CALLER_ADDR0, ip);
}
}
@@ -3104,8 +3335,22 @@ void preempt_count_sub(int val)
return;
#endif
- if (preempt_count() == val)
+ if (preempt_count() == val) {
+ struct preempt_store *ps = &per_cpu(the_ps,
+ raw_smp_processor_id());
+ u64 delta = sched_clock() - ps->ts;
+
+ /*
+ * Trace preempt disable stack if preemption
+ * is disabled for more than the threshold.
+ */
+ if (delta > sysctl_preemptoff_tracing_threshold_ns)
+ trace_sched_preempt_disable(delta, ps->irqs_disabled,
+ ps->caddr[0], ps->caddr[1],
+ ps->caddr[2], ps->caddr[3]);
+
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
@@ -3118,6 +3363,9 @@ NOKPROBE_SYMBOL(preempt_count_sub);
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
if (oops_in_progress)
return;
@@ -3128,12 +3376,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -3248,7 +3498,6 @@ static void __sched notrace __schedule(bool preempt)
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
@@ -3267,13 +3516,16 @@ static void __sched notrace __schedule(bool preempt)
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3306,14 +3558,20 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
- wallclock = walt_ktime_clock();
- walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
- walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
+ BUG_ON(task_cpu(next) != cpu_of(rq));
+
+ wallclock = sched_ktime_clock();
if (likely(prev != next)) {
+ update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
+ cpufreq_update_util(rq, 0);
+ if (!is_idle_task(prev) && !prev->on_rq)
+ update_avg_burst(prev);
+
#ifdef CONFIG_SCHED_WALT
if (!prev->on_rq)
prev->last_sleep_ts = wallclock;
@@ -3322,10 +3580,14 @@ static void __sched notrace __schedule(bool preempt)
rq->curr = next;
++*switch_count;
+ set_task_last_switch_out(prev, wallclock);
+
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else {
+ update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3510,7 +3772,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+ int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3539,11 +3801,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
+
+ if (oldprio == prio)
+ queue_flag &= ~DEQUEUE_MOVE;
+
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
@@ -3562,7 +3828,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag |= ENQUEUE_REPLENISH;
+ queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3570,7 +3836,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag |= ENQUEUE_HEAD;
+ queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3585,7 +3851,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, enqueue_flag);
+ enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3944,6 +4210,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -4127,17 +4394,14 @@ change:
* itself.
*/
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
- }
+ if (new_effective_prio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
@@ -4147,15 +4411,14 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
- int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- if (oldprio <= p->prio)
- enqueue_flags |= ENQUEUE_HEAD;
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
- enqueue_task(rq, p, enqueue_flags);
+ enqueue_task(rq, p, queue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -4233,7 +4496,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+EXPORT_SYMBOL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4553,6 +4816,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
int retval;
+ int dest_cpu;
+ cpumask_t allowed_mask;
rcu_read_lock();
@@ -4614,20 +4879,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
-
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
+ cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+ dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
+ if (dest_cpu < nr_cpu_ids) {
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
}
+ } else {
+ retval = -EINVAL;
}
+
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
@@ -4691,6 +4962,15 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+
+ /*
+ * The userspace tasks are forbidden to run on
+ * isolated CPUs. So exclude isolated CPUs from
+ * the getaffinity.
+ */
+ if (!(p->flags & PF_KTHREAD))
+ cpumask_andnot(mask, mask, cpu_isolated_mask);
+
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5112,6 +5392,8 @@ void show_state_filter(unsigned long state_filter)
sched_show_task(p);
}
+ touch_all_softlockup_watchdogs();
+
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
@@ -5149,6 +5431,8 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ kasan_unpoison_task_stack(idle);
+
#ifdef CONFIG_SMP
/*
* Its possible that init_idle() gets called multiple times on a task,
@@ -5371,18 +5655,54 @@ static struct task_struct fake_task = {
};
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Remove a task from the runqueue and pretend that it's migrating. This
+ * should prevent migrations for the detached task and disallow further
+ * changes to tsk_cpus_allowed.
+ */
+static void
+detach_one_task(struct task_struct *p, struct rq *rq, struct list_head *tasks)
+{
+ /* The caller must already hold rq->lock. */
+ lockdep_assert_held(&rq->lock);
+
+ /* Flag the task as migrating before taking it off the runqueue. */
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(rq, p, 0);
+ /* Park the task on @tasks, linked through its se.group_node. */
+ list_add(&p->se.group_node, tasks);
+}
+
+/*
+ * Put every task parked on @tasks back onto @rq, emptying the list.
+ * Entries are linked through se.group_node and must already belong to
+ * @rq (BUG otherwise). The caller must hold rq->lock.
+ */
+static void attach_tasks(struct list_head *tasks, struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+
+	for (;;) {
+		struct task_struct *task;
+
+		if (list_empty(tasks))
+			break;
+
+		task = list_first_entry(tasks, struct task_struct,
+					se.group_node);
+		list_del_init(&task->se.group_node);
+
+		BUG_ON(task_rq(task) != rq);
+		activate_task(rq, task, 0);
+		task->on_rq = TASK_ON_RQ_QUEUED;
+	}
+}
+
+/*
+ * Migrate all tasks from the rq. When @migrate_pinned_tasks is false,
+ * kthreads that cannot run on any online, non-isolated CPU are left in place.
+ * Sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
*
* Called with rq->lock held even though we'er in stop_machine() and
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
+ unsigned int num_pinned_kthreads = 1; /* this thread */
+ LIST_HEAD(tasks);
+ cpumask_t avail_cpus;
+
+ cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
/*
* Fudge the rq selection such that the below task selection loop
@@ -5418,6 +5738,14 @@ static void migrate_tasks(struct rq *dead_rq)
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
+ if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
+ !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
+ detach_one_task(next, rq, &tasks);
+ num_pinned_kthreads += 1;
+ lockdep_unpin_lock(&rq->lock);
+ continue;
+ }
+
/*
* Rules for changing task_struct::cpus_allowed are holding
* both pi_lock and rq->lock, such that holding either
@@ -5436,26 +5764,271 @@ static void migrate_tasks(struct rq *dead_rq)
* Since we're inside stop-machine, _nothing_ should have
* changed the task, WARN if weird stuff happened, because in
* that case the above rq->lock drop is a fail too.
+ * However, during cpu isolation the load balancer might have
+ * interfered since we don't stop all CPUs. Ignore warning for
+ * this case.
*/
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
+ WARN_ON(migrate_pinned_tasks);
raw_spin_unlock(&next->pi_lock);
continue;
}
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
+ raw_spin_unlock(&next->pi_lock);
raw_spin_unlock(&rq->lock);
+ notify_migration(dead_rq->cpu, dest_cpu, true, next);
rq = dead_rq;
+ raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
}
raw_spin_unlock(&next->pi_lock);
}
rq->stop = stop;
+
+ if (num_pinned_kthreads > 1)
+ attach_tasks(&tasks, rq);
+}
+
+static void set_rq_online(struct rq *rq);
+static void set_rq_offline(struct rq *rq);
+
+/*
+ * Per-CPU part of isolating a CPU; it acts on the CPU it is running on
+ * (sched_isolate_cpu() hands it to stop_cpus() for the target CPU).
+ *
+ * Disables the watchdog, moves interrupts off this CPU and then, with local
+ * interrupts disabled and rq->lock held, migrates the runnable tasks away
+ * while the runqueue is temporarily marked offline.
+ *
+ * @data is unused. Always returns 0.
+ */
+int do_isolation_work_cpu_stop(void *data)
+{
+ unsigned int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ watchdog_disable(cpu);
+
+ irq_migrate_all_off_this_cpu();
+
+ local_irq_disable();
+
+ sched_ttwu_pending();
+
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Temporarily mark the rq as offline. This will allow us to
+ * move tasks off the CPU.
+ */
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+
+ /* false: pinned kthreads are not migrated, unlike the hotplug path. */
+ migrate_tasks(rq, false);
+
+ /* Undo the temporary offline marking made above. */
+ if (rq->rd)
+ set_rq_online(rq);
+ raw_spin_unlock(&rq->lock);
+
+ /*
+ * We might have been in tickless state. Clear NOHZ flags to avoid
+ * us being kicked for helping out with balancing
+ */
+ nohz_balance_clear_nohz_mask(cpu);
+
+ clear_hmp_request(cpu);
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * Per-CPU part of un-isolating a CPU: re-enables the watchdog on the CPU
+ * this function runs on. @data is unused. Always returns 0.
+ */
+int do_unisolation_work_cpu_stop(void *data)
+{
+	unsigned int cpu = smp_processor_id();
+
+	watchdog_enable(cpu);
+
+	return 0;
+}
+
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
+
+/*
+ * Re-run init_sched_groups_capacity() for every sched domain of @cpu, and
+ * once more on behalf of each domain's group balance CPU when that CPU is
+ * not @cpu itself. Runs under sched_domains_mutex and rcu_read_lock().
+ */
+static void sched_update_group_capacities(int cpu)
+{
+	struct sched_domain *sd;
+
+	mutex_lock(&sched_domains_mutex);
+	rcu_read_lock();
+
+	for_each_domain(cpu, sd) {
+		int balancer = group_balance_cpu(sd->groups);
+
+		init_sched_groups_capacity(cpu, sd);
+
+		/* The balancing cpu needs the same update, unless it is us. */
+		if (balancer == cpu)
+			continue;
+
+		init_sched_groups_capacity(balancer, sd);
+	}
+
+	rcu_read_unlock();
+	mutex_unlock(&sched_domains_mutex);
+}
+
+static unsigned int cpu_isolation_vote[NR_CPUS];
+
+/*
+ * Count the CPUs of @mask that are isolated. With @include_offline set,
+ * CPUs of @mask that are not online are counted as well.
+ */
+int sched_isolate_count(const cpumask_t *mask, bool include_offline)
+{
+	cpumask_t count_mask = CPU_MASK_NONE;
+
+	if (!include_offline) {
+		cpumask_and(&count_mask, mask, cpu_isolated_mask);
+		return cpumask_weight(&count_mask);
+	}
+
+	/* (not online OR isolated) AND requested */
+	cpumask_complement(&count_mask, cpu_online_mask);
+	cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
+	cpumask_and(&count_mask, &count_mask, mask);
+
+	return cpumask_weight(&count_mask);
}
+
+/*
+ * sched_isolate_cpu() - cast one isolation vote for @cpu.
+ *
+ * Only the first vote for a CPU performs the isolation work; further votes
+ * just increment the per-CPU vote count and return 0.
+ *
+ * Returns 0 on success, -EINVAL if only one online, non-isolated CPU is
+ * left or if @cpu is offline, and -EBUSY if the watchdog for @cpu is still
+ * not configured after a 20 ms wait.
+ *
+ * 1) CPU is isolated and cpu is offlined:
+ * Unisolate the core.
+ * 2) CPU is not isolated and CPU is offlined:
+ * No action taken.
+ * 3) CPU is offline and request to isolate
+ * Request ignored.
+ * 4) CPU is offline and isolated:
+ * Not a possible state.
+ * 5) CPU is online and request to isolate
+ * Normal case: Isolate the CPU
+ * 6) CPU is not isolated and comes back online
+ * Nothing to do
+ *
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_isolate_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ cpumask_t avail_cpus;
+ int ret_code = 0;
+ u64 start_time = 0;
+
+ /* Only sample the clock when the tracepoint will consume it. */
+ if (trace_sched_isolate_enabled())
+ start_time = sched_clock();
+
+ cpu_maps_update_begin();
+
+ /* CPUs that are online and not yet isolated. */
+ cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
+
+ /* We cannot isolate ALL cpus in the system */
+ if (cpumask_weight(&avail_cpus) == 1) {
+ ret_code = -EINVAL;
+ goto out;
+ }
+
+ if (!cpu_online(cpu)) {
+ ret_code = -EINVAL;
+ goto out;
+ }
+
+ /* Already isolated by an earlier vote: just count this one. */
+ if (++cpu_isolation_vote[cpu] > 1)
+ goto out;
+
+ /*
+ * There is a race between watchdog being enabled by hotplug and
+ * core isolation disabling the watchdog. When a CPU is hotplugged in
+ * and the hotplug lock has been released the watchdog thread might
+ * not have run yet to enable the watchdog.
+ * We have to wait for the watchdog to be enabled before proceeding.
+ */
+ if (!watchdog_configured(cpu)) {
+ msleep(20);
+ if (!watchdog_configured(cpu)) {
+ /* Take back the vote cast above before failing. */
+ --cpu_isolation_vote[cpu];
+ ret_code = -EBUSY;
+ goto out;
+ }
+ }
+
+ set_cpu_isolated(cpu, true);
+ cpumask_clear_cpu(cpu, &avail_cpus);
+
+ /* Migrate timers */
+ smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
+ smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
+
+ /* Run the per-CPU isolation work on @cpu itself. */
+ stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+
+ calc_load_migrate(rq);
+ update_max_interval();
+ sched_update_group_capacities(cpu);
+
+out:
+ cpu_maps_update_done();
+ /* Traced on every exit path, including the error paths. */
+ trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+ start_time, 1);
+ return ret_code;
+}
+
+/*
+ * sched_unisolate_cpu_unlocked() - withdraw one isolation vote for @cpu.
+ *
+ * The CPU is un-isolated only when the last vote is withdrawn. Unlike
+ * sched_unisolate_cpu(), this variant does not call
+ * cpu_maps_update_begin()/cpu_maps_update_done() itself.
+ *
+ * Returns 0 on success and -EINVAL if @cpu has no outstanding vote.
+ *
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_unisolate_cpu_unlocked(int cpu)
+{
+ int ret_code = 0;
+ struct rq *rq = cpu_rq(cpu);
+ u64 start_time = 0;
+
+ /* Only sample the clock when the tracepoint will consume it. */
+ if (trace_sched_isolate_enabled())
+ start_time = sched_clock();
+
+ /* Nothing to withdraw: this CPU has no isolation votes. */
+ if (!cpu_isolation_vote[cpu]) {
+ ret_code = -EINVAL;
+ goto out;
+ }
+
+ /* Other votes remain, so the CPU stays isolated. */
+ if (--cpu_isolation_vote[cpu])
+ goto out;
+
+ if (cpu_online(cpu)) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ rq->age_stamp = sched_clock_cpu(cpu);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ set_cpu_isolated(cpu, false);
+ update_max_interval();
+ sched_update_group_capacities(cpu);
+
+ if (cpu_online(cpu)) {
+ /* Run the per-CPU un-isolation work on @cpu itself. */
+ stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
+
+ /* Kick CPU to immediately do load balancing */
+ if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ smp_send_reschedule(cpu);
+ }
+
+out:
+ /* Traced on every exit path, including the error paths. */
+ trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+ start_time, 0);
+ return ret_code;
+}
+
+/*
+ * Withdraw one isolation vote for @cpu: brackets
+ * sched_unisolate_cpu_unlocked() with cpu_maps_update_begin()/done() and
+ * returns its result.
+ */
+int sched_unisolate_cpu(int cpu)
+{
+	int rc;
+
+	cpu_maps_update_begin();
+	rc = sched_unisolate_cpu_unlocked(cpu);
+	cpu_maps_update_done();
+
+	return rc;
+}
+
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5743,7 +6316,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE:
raw_spin_lock_irqsave(&rq->lock, flags);
- walt_set_window_start(rq);
+ set_window_start(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
break;
@@ -5764,17 +6337,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
- walt_migrate_sync_cpu(cpu);
+
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(rq);
+ migrate_tasks(rq, true);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
case CPU_DEAD:
+ clear_hmp_request(cpu);
calc_load_migrate(rq);
break;
#endif
@@ -6319,6 +6893,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
+ unsigned long next_balance = rq->next_balance;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -6350,6 +6925,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL;
}
+ for (tmp = sd; tmp; ) {
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(tmp->balance_interval);
+ if (time_after(next_balance, tmp->last_balance + interval))
+ next_balance = tmp->last_balance + interval;
+
+ tmp = tmp->parent;
+ }
+ rq->next_balance = next_balance;
+
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
@@ -6599,11 +7185,14 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ cpumask_t avail_mask;
WARN_ON(!sg);
do {
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ cpumask_andnot(&avail_mask, sched_group_cpus(sg),
+ cpu_isolated_mask);
+ sg->group_weight = cpumask_weight(&avail_mask);
sg = sg->next;
} while (sg != sd->groups);
@@ -7336,6 +7925,9 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
+#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -7400,7 +7992,8 @@ static int build_sched_domains(const struct cpumask *cpu_map,
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
- init_sched_energy(i, sd, tl->energy);
+ if (energy_aware())
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -7722,6 +8315,8 @@ void __init sched_init_smp(void)
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+ update_cluster_topology();
+
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -7740,6 +8335,7 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
+
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -7763,6 +8359,15 @@ void __init sched_init(void)
int i, j;
unsigned long alloc_size = 0, ptr;
+#ifdef CONFIG_SCHED_HMP
+ pr_info("HMP scheduling enabled.\n");
+#endif
+
+ BUG_ON(num_possible_cpus() > BITS_PER_LONG);
+
+ sched_boost_parse_dt();
+ init_clusters();
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
@@ -7879,12 +8484,53 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->max_idle_balance_cost = sysctl_sched_migration_cost;
-#ifdef CONFIG_SCHED_WALT
+#ifdef CONFIG_SCHED_HMP
+ cpumask_set_cpu(i, &rq->freq_domain_cpumask);
+ rq->hmp_stats.cumulative_runnable_avg = 0;
+ rq->window_start = 0;
+ rq->hmp_stats.nr_big_tasks = 0;
+ rq->hmp_flags = 0;
rq->cur_irqload = 0;
rq->avg_irqload = 0;
rq->irqload_ts = 0;
+ rq->static_cpu_pwr_cost = 0;
+ rq->cc.cycles = 1;
+ rq->cc.time = 1;
+ rq->cstate = 0;
+ rq->wakeup_latency = 0;
+ rq->wakeup_energy = 0;
+
+ /*
+ * All cpus part of same cluster by default. This avoids the
+ * need to check for rq->cluster being non-NULL in hot-paths
+ * like select_best_cpu()
+ */
+ rq->cluster = &init_cluster;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
+ rq->old_busy_time = 0;
+ rq->old_estimated_time = 0;
+ rq->old_busy_time_group = 0;
+ rq->hmp_stats.pred_demands_sum = 0;
+ rq->curr_table = 0;
+ rq->prev_top = 0;
+ rq->curr_top = 0;
+
+ for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
+ memset(&rq->load_subs[j], 0,
+ sizeof(struct load_subtractions));
+
+ rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
+ sizeof(u8), GFP_NOWAIT);
+
+ /* No other choice */
+ BUG_ON(!rq->top_tasks[j]);
+
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]);
+ }
#endif
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7900,6 +8546,11 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
+ i = alloc_related_thread_groups();
+ BUG_ON(i);
+
+ set_hmp_defaults();
+
set_load_weight(&init_task);
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7924,6 +8575,7 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
+ init_new_task_load(current);
calc_load_update = jiffies + LOAD_FREQ;
@@ -7977,6 +8629,7 @@ EXPORT_SYMBOL(__might_sleep);
void ___might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
+ unsigned long preempt_disable_ip;
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7989,6 +8642,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
return;
prev_jiffy = jiffies;
+ /* Save this before calling printk(), since that will clobber it */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
printk(KERN_ERR
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
@@ -8003,12 +8659,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (!preempt_count_equals(preempt_offset)) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
#endif
dump_stack();
}
@@ -8220,7 +8878,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -8229,7 +8887,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags);
}
@@ -8611,7 +9269,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
#ifdef CONFIG_CGROUP_SCHED
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct task_group, css) : NULL;
}
@@ -9011,6 +9669,13 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
+#ifdef CONFIG_SCHED_HMP
+ {
+ .name = "upmigrate_discourage",
+ .read_u64 = cpu_upmigrate_discourage_read_u64,
+ .write_u64 = cpu_upmigrate_discourage_write_u64,
+ },
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
new file mode 100644
index 000000000000..2f060a570061
--- /dev/null
+++ b/kernel/sched/core_ctl.c
@@ -0,0 +1,1171 @@
+/* Copyright (c) 2014-2017, 2020 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "core_ctl: " fmt
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include <trace/events/sched.h>
+#include "sched.h"
+
+#define MAX_CPUS_PER_CLUSTER 4
+#define MAX_CLUSTERS 2
+
+/*
+ * Per-cluster core_ctl state. Instances live in cluster_state[] and embed
+ * the kobject through which the sysfs attributes of this file reach them.
+ */
+struct cluster_data {
+ bool inited;
+ /* Set through sysfs; stored clamped to max_cpus. */
+ unsigned int min_cpus;
+ /* Set through sysfs; stored clamped to num_cpus. */
+ unsigned int max_cpus;
+ /* Set through sysfs. */
+ unsigned int offline_delay_ms;
+ /* Thresholds set through sysfs; entries 0..num_cpus-1 are used. */
+ unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER];
+ unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER];
+ unsigned int active_cpus;
+ /* Number of CPUs in the cluster; bounds the per-CPU arrays above. */
+ unsigned int num_cpus;
+ unsigned int nr_isolated_cpus;
+ cpumask_t cpu_mask;
+ unsigned int need_cpus;
+ /* Set through sysfs; values below num_cpus are rejected. */
+ unsigned int task_thres;
+ unsigned int max_nr;
+ s64 need_ts;
+ struct list_head lru;
+ bool pending;
+ spinlock_t pending_lock;
+ /* Set through sysfs. */
+ bool is_big_cluster;
+ /* Set through sysfs; a change triggers apply_need(). */
+ bool enable;
+ int nrrun;
+ bool nrrun_changed;
+ struct task_struct *core_ctl_thread;
+ /* CPU number of the cluster's first CPU; base index into cpu_state. */
+ unsigned int first_cpu;
+ unsigned int boost;
+ /* Embedded kobject; to_cluster_data() maps it back to this struct. */
+ struct kobject kobj;
+};
+
+/* Per-CPU core_ctl state; one instance per CPU lives in cpu_state. */
+struct cpu_data {
+ bool is_busy;
+ /* Reported as "Busy%" by the global_state attribute. */
+ unsigned int busy;
+ unsigned int cpu;
+ /* Set through the not_preferred sysfs attribute. */
+ bool not_preferred;
+ /* Owning cluster; readers check it for NULL before use. */
+ struct cluster_data *cluster;
+ struct list_head sib;
+ bool isolated_by_us;
+ unsigned int max_nr;
+};
+
+/* Per-CPU and per-cluster state tables; num_clusters bounds cluster_state[]. */
+static DEFINE_PER_CPU(struct cpu_data, cpu_state);
+static struct cluster_data cluster_state[MAX_CLUSTERS];
+static unsigned int num_clusters;
+
+/*
+ * Walk cluster_state[] from the current value of @idx up to num_clusters.
+ * The macro does NOT initialise @idx: the caller must set it (normally to 0)
+ * beforehand. @cluster is assigned as a side effect of the loop condition.
+ */
+#define for_each_cluster(cluster, idx) \
+ for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\
+ (idx)++)
+
+/* Held by the sysfs handlers below while they walk or update cpu_state. */
+static DEFINE_SPINLOCK(state_lock);
+/* Forward declarations for helpers defined later in this file. */
+static void apply_need(struct cluster_data *state);
+static void wake_up_core_ctl_thread(struct cluster_data *state);
+static bool initialized;
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster);
+
+/* ========================= sysfs interface =========================== */
+
+/*
+ * sysfs store for min_cpus: parses one unsigned value, stores it clamped
+ * to max_cpus and wakes the core_ctl thread. Returns @count, or -EINVAL
+ * when no number could be parsed.
+ */
+static ssize_t store_min_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int requested;
+	int parsed = sscanf(buf, "%u\n", &requested);
+
+	if (parsed != 1)
+		return -EINVAL;
+
+	state->min_cpus = min(requested, state->max_cpus);
+	wake_up_core_ctl_thread(state);
+
+	return count;
+}
+
+/* sysfs show for min_cpus: prints the value followed by a newline. */
+static ssize_t show_min_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int min_cpus = state->min_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", min_cpus);
+}
+
+/*
+ * sysfs store for max_cpus: parses one unsigned value, stores it clamped
+ * to num_cpus, pulls min_cpus down to the new maximum if needed and wakes
+ * the core_ctl thread. Returns @count, or -EINVAL on a parse failure.
+ */
+static ssize_t store_max_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int requested;
+	int parsed = sscanf(buf, "%u\n", &requested);
+
+	if (parsed != 1)
+		return -EINVAL;
+
+	state->max_cpus = min(requested, state->num_cpus);
+	state->min_cpus = min(state->min_cpus, state->max_cpus);
+	wake_up_core_ctl_thread(state);
+
+	return count;
+}
+
+/* sysfs show for max_cpus: prints the value followed by a newline. */
+static ssize_t show_max_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int max_cpus = state->max_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", max_cpus);
+}
+
+/*
+ * sysfs store for offline_delay_ms: parses one unsigned value, stores it
+ * and calls apply_need(). Returns @count, or -EINVAL on a parse failure.
+ */
+static ssize_t store_offline_delay_ms(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int delay_ms;
+	int parsed = sscanf(buf, "%u\n", &delay_ms);
+
+	if (parsed != 1)
+		return -EINVAL;
+
+	state->offline_delay_ms = delay_ms;
+	apply_need(state);
+
+	return count;
+}
+
+/* sysfs show for task_thres: prints the value followed by a newline. */
+static ssize_t show_task_thres(const struct cluster_data *state, char *buf)
+{
+	const unsigned int task_thres = state->task_thres;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", task_thres);
+}
+
+/*
+ * sysfs store for task_thres: parses one unsigned value and rejects it
+ * with -EINVAL if it cannot be parsed or is below num_cpus; otherwise
+ * stores it, calls apply_need() and returns @count.
+ */
+static ssize_t store_task_thres(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int thres;
+
+	/* Short-circuit keeps @thres from being read when parsing failed. */
+	if (sscanf(buf, "%u\n", &thres) != 1 || thres < state->num_cpus)
+		return -EINVAL;
+
+	state->task_thres = thres;
+	apply_need(state);
+
+	return count;
+}
+
+/* sysfs show for offline_delay_ms: prints the value and a newline. */
+static ssize_t show_offline_delay_ms(const struct cluster_data *state,
+					char *buf)
+{
+	const unsigned int delay_ms = state->offline_delay_ms;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", delay_ms);
+}
+
+/*
+ * sysfs store for busy_up_thres. Accepts either a single value, which is
+ * applied to every CPU of the cluster, or exactly num_cpus values, one per
+ * CPU. Anything else yields -EINVAL. Calls apply_need() and returns @count
+ * on success.
+ */
+static ssize_t store_busy_up_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	bool broadcast;
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	/* A lone value is replicated across all CPUs of the cluster. */
+	broadcast = (ret == 1);
+	for (i = 0; i < state->num_cpus; i++)
+		state->busy_up_thres[i] = broadcast ? val[0] : val[i];
+
+	apply_need(state);
+	return count;
+}
+
+/*
+ * sysfs show for busy_up_thres: prints the per-CPU thresholds separated by
+ * spaces, followed by a newline. Returns the number of bytes written.
+ *
+ * scnprintf() is used because its return value is the number of bytes
+ * actually stored. snprintf() returns the would-be length, so accumulating
+ * it could push @count past PAGE_SIZE and underflow the remaining size.
+ */
+static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%u ",
+				   state->busy_up_thres[i]);
+
+	count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+/*
+ * sysfs store for busy_down_thres. Accepts either a single value, which is
+ * applied to every CPU of the cluster, or exactly num_cpus values, one per
+ * CPU. Anything else yields -EINVAL. Calls apply_need() and returns @count
+ * on success.
+ */
+static ssize_t store_busy_down_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	bool broadcast;
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	/* A lone value is replicated across all CPUs of the cluster. */
+	broadcast = (ret == 1);
+	for (i = 0; i < state->num_cpus; i++)
+		state->busy_down_thres[i] = broadcast ? val[0] : val[i];
+
+	apply_need(state);
+	return count;
+}
+
+/*
+ * sysfs show for busy_down_thres: prints the per-CPU thresholds separated
+ * by spaces, followed by a newline. Returns the number of bytes written.
+ *
+ * scnprintf() is used because its return value is the number of bytes
+ * actually stored. snprintf() returns the would-be length, so accumulating
+ * it could push @count past PAGE_SIZE and underflow the remaining size.
+ */
+static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%u ",
+				   state->busy_down_thres[i]);
+
+	count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+/*
+ * sysfs store for is_big_cluster: parses one unsigned value and records
+ * whether it was non-zero. Returns @count, or -EINVAL on a parse failure.
+ */
+static ssize_t store_is_big_cluster(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int flag;
+	int parsed = sscanf(buf, "%u\n", &flag);
+
+	if (parsed != 1)
+		return -EINVAL;
+
+	state->is_big_cluster = (flag != 0);
+
+	return count;
+}
+
+/* sysfs show for is_big_cluster: prints 0 or 1 followed by a newline. */
+static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf)
+{
+	const unsigned int is_big = state->is_big_cluster;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", is_big);
+}
+
+/*
+ * sysfs store for enable: parses one unsigned value and treats any
+ * non-zero value as "enabled". apply_need() is called only when the
+ * setting actually changes. Returns @count, or -EINVAL on a parse failure.
+ */
+static ssize_t store_enable(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+	bool requested;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	requested = (val != 0);
+	if (requested == state->enable)
+		return count;
+
+	state->enable = requested;
+	apply_need(state);
+
+	return count;
+}
+
+/* sysfs show for enable: prints 0 or 1 followed by a newline. */
+static ssize_t show_enable(const struct cluster_data *state, char *buf)
+{
+	const unsigned int enabled = state->enable;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", enabled);
+}
+
+/* sysfs show for need_cpus: prints the value followed by a newline. */
+static ssize_t show_need_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int need_cpus = state->need_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", need_cpus);
+}
+
+/* sysfs show for active_cpus: prints the value followed by a newline. */
+static ssize_t show_active_cpus(const struct cluster_data *state, char *buf)
+{
+	const unsigned int active_cpus = state->active_cpus;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", active_cpus);
+}
+
+/*
+ * sysfs show for global_state: dumps, for every possible CPU that belongs
+ * to an initialised cluster, its per-CPU state plus a few fields of the
+ * owning cluster. @state is not used; the walk covers all clusters.
+ * Runs under state_lock with interrupts disabled. Returns the number of
+ * bytes written to @buf.
+ *
+ * scnprintf() is used for every append because it returns the number of
+ * bytes actually stored. snprintf() returns the would-be length, so
+ * accumulating it over this many appends could push @count past PAGE_SIZE,
+ * underflow "PAGE_SIZE - count" and report a length larger than the buffer.
+ */
+static ssize_t show_global_state(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *c;
+	struct cluster_data *cluster;
+	ssize_t count = 0;
+	unsigned int cpu;
+
+	spin_lock_irq(&state_lock);
+	for_each_possible_cpu(cpu) {
+		c = &per_cpu(cpu_state, cpu);
+		cluster = c->cluster;
+		/* Skip CPUs that no initialised cluster has claimed. */
+		if (!cluster || !cluster->inited)
+			continue;
+
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"CPU%u\n", cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tCPU: %u\n", c->cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tOnline: %u\n",
+					cpu_online(c->cpu));
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tIsolated: %u\n",
+					cpu_isolated(c->cpu));
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tFirst CPU: %u\n",
+					cluster->first_cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tBusy%%: %u\n", c->busy);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tIs busy: %u\n", c->is_busy);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tNot preferred: %u\n",
+					c->not_preferred);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tNr running: %u\n", cluster->nrrun);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+			"\tActive CPUs: %u\n", get_active_cpu_count(cluster));
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				"\tNeed CPUs: %u\n", cluster->need_cpus);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tNr isolated CPUs: %u\n",
+					cluster->nr_isolated_cpus);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				"\tBoost: %u\n", (unsigned int) cluster->boost);
+	}
+	spin_unlock_irq(&state_lock);
+
+	return count;
+}
+
+/*
+ * sysfs store for not_preferred: expects exactly num_cpus unsigned values,
+ * one per CPU of the cluster in CPU order starting at first_cpu, and
+ * records each in that CPU's cpu_state under state_lock. Returns @count,
+ * or -EINVAL when the number of parsed values does not match.
+ */
+static ssize_t store_not_preferred(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	unsigned long flags;
+	unsigned int idx;
+	int parsed;
+
+	parsed = sscanf(buf, "%u %u %u %u\n",
+			&val[0], &val[1], &val[2], &val[3]);
+	if (parsed != state->num_cpus)
+		return -EINVAL;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for (idx = 0; idx < state->num_cpus; idx++) {
+		struct cpu_data *c = &per_cpu(cpu_state,
+					      idx + state->first_cpu);
+
+		c->not_preferred = val[idx];
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return count;
+}
+
+/*
+ * sysfs show for not_preferred: prints one "CPU#<n>: <flag>" line for each
+ * CPU of the cluster, read under state_lock. Returns the number of bytes
+ * written to @buf.
+ */
+static ssize_t show_not_preferred(const struct cluster_data *state, char *buf)
+{
+	unsigned long flags;
+	ssize_t len = 0;
+	int idx;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for (idx = 0; idx < state->num_cpus; idx++) {
+		struct cpu_data *c = &per_cpu(cpu_state,
+					      idx + state->first_cpu);
+
+		len += scnprintf(buf + len, PAGE_SIZE - len,
+				 "CPU#%d: %u\n", c->cpu, c->not_preferred);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return len;
+}
+
+/*
+ * A core_ctl sysfs attribute: the generic attribute plus per-cluster
+ * show/store callbacks that the show()/store() dispatchers in this file call.
+ */
+struct core_ctl_attr {
+ struct attribute attr; /* embedded attribute; to_attr() recovers the wrapper */
+ ssize_t (*show)(const struct cluster_data *, char *); /* NULL for no read handler */
+ ssize_t (*store)(struct cluster_data *, const char *, size_t count); /* NULL for read-only */
+};
+/*
+ * Helpers defining a static struct core_ctl_attr named after a tunable.
+ * The _ro form uses mode 0444 and wires only show_<name>; the _rw form uses
+ * mode 0644 and wires both show_<name> and store_<name>.
+ */
+#define core_ctl_attr_ro(_name) \
+static struct core_ctl_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name) \
+static struct core_ctl_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+/*
+ * Attribute instances; each name must have matching show_/store_ handlers.
+ */
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(active_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+core_ctl_attr_rw(enable);
+/*
+ * NULL-terminated attribute list installed through ktype_core_ctl, i.e. the
+ * files that appear in each cluster's "core_ctl" sysfs directory.
+ */
+static struct attribute *default_attrs[] = {
+ &min_cpus.attr,
+ &max_cpus.attr,
+ &offline_delay_ms.attr,
+ &busy_up_thres.attr,
+ &busy_down_thres.attr,
+ &task_thres.attr,
+ &is_big_cluster.attr,
+ &enable.attr,
+ &need_cpus.attr,
+ &active_cpus.attr,
+ &global_state.attr,
+ &not_preferred.attr,
+ NULL /* list terminator */
+};
+
+#define to_cluster_data(k) container_of(k, struct cluster_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+/*
+ * Generic sysfs read dispatcher: map the kobject/attribute pair back to the
+ * owning cluster and core_ctl_attr, then call the attribute's show handler.
+ * Returns -EIO when the attribute has no show handler.
+ */
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct core_ctl_attr *cattr = to_attr(attr);
+	struct cluster_data *data = to_cluster_data(kobj);
+
+	if (!cattr->show)
+		return -EIO;
+
+	return cattr->show(data, buf);
+}
+
+/*
+ * Generic sysfs write dispatcher: map the kobject/attribute pair back to the
+ * owning cluster and core_ctl_attr, then call the attribute's store handler.
+ * Returns -EIO when the attribute has no store handler.
+ */
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct core_ctl_attr *cattr = to_attr(attr);
+	struct cluster_data *data = to_cluster_data(kobj);
+
+	if (!cattr->store)
+		return -EIO;
+
+	return cattr->store(data, buf, count);
+}
+/*
+ * sysfs operations shared by every core_ctl attribute; they forward to the
+ * per-attribute handlers through show()/store() above.
+ */
+static const struct sysfs_ops sysfs_ops = {
+ .show = show,
+ .store = store,
+};
+/*
+ * kobject type for a cluster's "core_ctl" directory. NOTE(review): no
+ * .release callback is set here.
+ */
+static struct kobj_type ktype_core_ctl = {
+ .sysfs_ops = &sysfs_ops,
+ .default_attrs = default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+static s64 rq_avg_timestamp_ms;
+
+/*
+ * update_running_avg - refresh each cluster's runqueue statistics.
+ * @trigger_update: when true, re-evaluate the CPU need of every cluster whose
+ *                  statistics changed; when false, only flag the change in
+ *                  cluster->nrrun_changed.
+ *
+ * Rate limited: does nothing if less than rq_avg_period_ms minus
+ * RQ_AVG_TOLERANCE milliseconds passed since the previous sample. The
+ * timestamp check and the sampling happen under state_lock; the per-cluster
+ * update runs after the lock is released.
+ */
+static void update_running_avg(bool trigger_update)
+{
+	int avg, iowait_avg, big_avg;
+	int max_nr, big_max_nr;
+	int prev_nrrun, prev_max_nr;
+	unsigned long flags;
+	unsigned int index = 0;
+	struct cluster_data *cluster;
+	s64 now;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	now = ktime_to_ms(ktime_get());
+	if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return;
+	}
+	rq_avg_timestamp_ms = now;
+	sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg,
+				 &max_nr, &big_max_nr);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	for_each_cluster(cluster, index) {
+		if (!cluster->inited)
+			continue;
+
+		/* Remember the previous sample, then store the new one. */
+		prev_nrrun = cluster->nrrun;
+		prev_max_nr = cluster->max_nr;
+		if (cluster->is_big_cluster) {
+			cluster->nrrun = big_avg;
+			cluster->max_nr = big_max_nr;
+		} else {
+			cluster->nrrun = avg;
+			cluster->max_nr = max_nr;
+		}
+
+		if (cluster->nrrun == prev_nrrun &&
+		    cluster->max_nr == prev_max_nr)
+			continue;
+
+		if (trigger_update)
+			apply_need(cluster);
+		else
+			cluster->nrrun_changed = true;
+	}
+}
+
+#define MAX_NR_THRESHOLD 4
+/*
+ * apply_task_need - adjust the load-based CPU need using runqueue data.
+ * @cluster:  cluster being evaluated
+ * @new_need: number of CPUs needed according to the busy thresholds
+ *
+ * Returns the adjusted number of needed CPUs (not yet clamped to the
+ * cluster's min/max limits).
+ */
+static unsigned int apply_task_need(const struct cluster_data *cluster,
+				    unsigned int new_need)
+{
+	unsigned int need = new_need;
+
+	/* Enough runnable tasks: ask for every CPU in the cluster. */
+	if (cluster->nrrun >= cluster->task_thres)
+		return cluster->num_cpus;
+
+	/* More runnable tasks than needed CPUs: ask for one more CPU. */
+	if (cluster->nrrun > need)
+		need++;
+
+	/*
+	 * Avoid overcrowding: if the recorded maximum exceeded
+	 * MAX_NR_THRESHOLD, bring in another CPU to help out.
+	 */
+	if (cluster->max_nr > MAX_NR_THRESHOLD)
+		need++;
+
+	return need;
+}
+
+/* ======================= load based core count ====================== */
+
+/*
+ * Bound @need_cpus by the cluster's limits: raise it to at least min_cpus,
+ * then cap it at max_cpus (the cap wins if the two limits conflict).
+ */
+static unsigned int apply_limits(const struct cluster_data *cluster,
+				 unsigned int need_cpus)
+{
+	unsigned int at_least_min = max(cluster->min_cpus, need_cpus);
+
+	return min(at_least_min, cluster->max_cpus);
+}
+
+/*
+ * Number of CPUs in the cluster minus the count reported by
+ * sched_isolate_count() for the cluster's CPU mask.
+ */
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster)
+{
+	unsigned int inactive = sched_isolate_count(&cluster->cpu_mask, true);
+
+	return cluster->num_cpus - inactive;
+}
+
+/* A CPU counts as active when it is online and not isolated. */
+static bool is_active(const struct cpu_data *state)
+{
+	if (!cpu_online(state->cpu))
+		return false;
+
+	return !cpu_isolated(state->cpu);
+}
+
+/*
+ * Can the active CPU count be moved towards @need? Shrinking is always
+ * possible; growing requires at least one CPU that core_ctl has isolated.
+ */
+static bool adjustment_possible(const struct cluster_data *cluster,
+				unsigned int need)
+{
+	if (need < cluster->active_cpus)
+		return true;
+
+	return need > cluster->active_cpus && cluster->nr_isolated_cpus;
+}
+/*
+ * eval_need - recompute how many CPUs @cluster needs.
+ *
+ * Under state_lock: derive the need (max_cpus when boosted or disabled,
+ * otherwise the number of busy CPUs adjusted by apply_task_need()), clamp it
+ * with apply_limits(), and decide whether cluster->need_cpus is updated now.
+ * A need above the current active count is taken immediately; any other
+ * change is taken only once offline_delay_ms has elapsed since need_ts.
+ *
+ * Returns true when need_cpus was updated and adjustment_possible() says the
+ * active count can actually move; false otherwise (including when the
+ * cluster is not initialised).
+ */
+static bool eval_need(struct cluster_data *cluster)
+{
+ unsigned long flags;
+ struct cpu_data *c;
+ unsigned int need_cpus = 0, last_need, thres_idx;
+ int ret = 0;
+ bool need_flag = false;
+ unsigned int new_need;
+ s64 now, elapsed;
+
+ if (unlikely(!cluster->inited))
+ return 0;
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (cluster->boost || !cluster->enable) {
+ need_cpus = cluster->max_cpus;
+ } else {
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0; /* thresholds are indexed by active count - 1 */
+ list_for_each_entry(c, &cluster->lru, sib) {
+ if (c->busy >= cluster->busy_up_thres[thres_idx] ||
+ sched_cpu_high_irqload(c->cpu))
+ c->is_busy = true;
+ else if (c->busy < cluster->busy_down_thres[thres_idx])
+ c->is_busy = false; /* between the thresholds is_busy keeps its old value */
+ need_cpus += c->is_busy;
+ }
+ need_cpus = apply_task_need(cluster, need_cpus);
+ }
+ new_need = apply_limits(cluster, need_cpus);
+ need_flag = adjustment_possible(cluster, new_need);
+
+ last_need = cluster->need_cpus;
+ now = ktime_to_ms(ktime_get());
+
+ if (new_need > cluster->active_cpus) { /* more CPUs wanted: no delay */
+ ret = 1;
+ } else {
+ if (new_need == last_need) { /* unchanged: restart the delay window */
+ cluster->need_ts = now;
+ spin_unlock_irqrestore(&state_lock, flags);
+ return 0;
+ }
+
+ elapsed = now - cluster->need_ts;
+ ret = elapsed >= cluster->offline_delay_ms; /* accept only after the delay */
+ }
+
+ if (ret) {
+ cluster->need_ts = now;
+ cluster->need_cpus = new_need;
+ }
+ trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need,
+ ret && need_flag);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret && need_flag;
+}
+
+/*
+ * Re-evaluate the cluster's CPU need and, if it changed in a way that can be
+ * acted upon, wake the cluster's core_ctl thread to enforce it.
+ */
+static void apply_need(struct cluster_data *cluster)
+{
+	bool wake = eval_need(cluster);
+
+	if (wake)
+		wake_up_core_ctl_thread(cluster);
+}
+/*
+ * core_ctl_set_busy - record a new busy value for @cpu and re-evaluate need.
+ *
+ * Called from cpufreq_gov_cb() with the load reported for the CPU. Nothing is
+ * done when the CPU has no initialised cluster, or when neither the busy
+ * value nor the cluster's runqueue statistics changed. Always returns 0.
+ */
+static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
+{
+ struct cpu_data *c = &per_cpu(cpu_state, cpu);
+ struct cluster_data *cluster = c->cluster;
+ unsigned int old_is_busy = c->is_busy; /* captured for the trace below */
+
+ if (!cluster || !cluster->inited)
+ return 0;
+
+ update_running_avg(false); /* may set cluster->nrrun_changed */
+ if (c->busy == busy && !cluster->nrrun_changed)
+ return 0;
+ c->busy = busy;
+ cluster->nrrun_changed = false;
+
+ apply_need(cluster);
+ trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
+ return 0;
+}
+
+/* ========================= core count enforcement ==================== */
+/*
+ * Mark work as pending for the cluster (under pending_lock) and wake its
+ * core_ctl thread, which consumes the flag in try_core_ctl().
+ */
+static void wake_up_core_ctl_thread(struct cluster_data *cluster)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ cluster->pending = true;
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+ wake_up_process_no_notif(cluster->core_ctl_thread);
+}
+
+static u64 core_ctl_check_timestamp;
+static u64 core_ctl_check_interval;
+
+/*
+ * Rate limiter for core_ctl_check(): returns true, and records @wallclock as
+ * the new reference time, when at least core_ctl_check_interval has passed
+ * since the previously recorded timestamp. Serialised by state_lock.
+ */
+static bool do_check(u64 wallclock)
+{
+	unsigned long flags;
+	bool due;
+
+	spin_lock_irqsave(&state_lock, flags);
+	due = (wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval;
+	if (due)
+		core_ctl_check_timestamp = wallclock;
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return due;
+}
+
+/*
+ * core_ctl_set_boost - take or drop a boost reference on the big cluster.
+ * @boost: true to add a boost reference, false to remove one
+ *
+ * Only the first cluster flagged is_big_cluster is affected. The boost count
+ * is changed under state_lock; when it moves between zero and non-zero the
+ * cluster's CPU need is re-evaluated.
+ *
+ * Returns 0 on success (also when core_ctl is not initialised or no big
+ * cluster exists), or -EINVAL when asked to drop a boost that is not held.
+ */
+int core_ctl_set_boost(bool boost)
+{
+	unsigned int index = 0;
+	struct cluster_data *cluster;
+	struct cluster_data *big = NULL;
+	unsigned long flags;
+	int ret = 0;
+	bool boost_state_changed = false;
+
+	if (unlikely(!initialized))
+		return 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for_each_cluster(cluster, index) {
+		if (!cluster->is_big_cluster)
+			continue;
+
+		big = cluster;
+		if (boost) {
+			boost_state_changed = !cluster->boost;
+			++cluster->boost;
+		} else if (!cluster->boost) {
+			pr_err("Error turning off boost. Boost already turned off\n");
+			ret = -EINVAL;
+		} else {
+			--cluster->boost;
+			boost_state_changed = !cluster->boost;
+		}
+		break;
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/*
+	 * Without a big cluster the loop iterator does not refer to a cluster
+	 * we selected (presumably it points past the registered clusters), so
+	 * it must not be dereferenced for the trace point below.
+	 */
+	if (!big)
+		return ret;
+
+	if (boost_state_changed)
+		apply_need(big);
+
+	trace_core_ctl_set_boost(big->boost, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL(core_ctl_set_boost);
+
+/*
+ * core_ctl_check - periodic entry point.
+ * @wallclock: current time, compared against the last check by do_check()
+ *
+ * When core_ctl is initialised and the check interval has elapsed, refresh
+ * the runqueue statistics and re-evaluate the CPU need of every cluster,
+ * waking the core_ctl thread of each cluster whose need changed.
+ */
+void core_ctl_check(u64 wallclock)
+{
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+
+	if (unlikely(!initialized))
+		return;
+
+	if (!do_check(wallclock))
+		return;
+
+	update_running_avg(true);
+
+	for_each_cluster(cluster, index)
+		apply_need(cluster);
+}
+
+/* Move a CPU to the tail of its cluster's LRU list, under state_lock. */
+static void move_cpu_lru(struct cpu_data *cpu_data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_move_tail(&cpu_data->sib, &cpu_data->cluster->lru);
+	spin_unlock_irqrestore(&state_lock, flags);
+}
+/*
+ * try_to_isolate - isolate CPUs until the cluster has @need active CPUs.
+ *
+ * First pass: walk the LRU list and isolate active, non-busy CPUs until
+ * active_cpus equals @need. Second pass (only if active_cpus still exceeds
+ * max_cpus): isolate active CPUs regardless of is_busy until the max_cpus
+ * limit is met. CPUs isolated here are flagged isolated_by_us and moved to
+ * the LRU tail; nr_isolated_cpus is updated after each pass.
+ */
+static void try_to_isolate(struct cluster_data *cluster, unsigned int need)
+{
+ struct cpu_data *c, *tmp;
+ unsigned long flags;
+ unsigned int num_cpus = cluster->num_cpus;
+ unsigned int nr_isolated = 0;
+
+ /*
+ * Protect against entry being removed (and added at tail) by other
+ * thread (hotplug).
+ *
+ * The lock is dropped around sched_isolate_cpu() below, so num_cpus
+ * additionally bounds the walk to at most cluster->num_cpus entries.
+ */
+ spin_lock_irqsave(&state_lock, flags);
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!num_cpus--)
+ break;
+
+ if (!is_active(c))
+ continue;
+ if (cluster->active_cpus == need)
+ break;
+ /* Don't isolate busy CPUs in this pass. */
+ if (c->is_busy)
+ continue;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ pr_debug("Trying to isolate CPU%u\n", c->cpu);
+ if (!sched_isolate_cpu(c->cpu)) { /* zero return is treated as success */
+ c->isolated_by_us = true;
+ move_cpu_lru(c);
+ nr_isolated++;
+ } else {
+ pr_debug("Unable to isolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ spin_lock_irqsave(&state_lock, flags);
+ }
+ cluster->nr_isolated_cpus += nr_isolated;
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ /*
+ * If the number of active CPUs is within the limits, then
+ * don't force isolation of any busy CPUs.
+ */
+ if (cluster->active_cpus <= cluster->max_cpus)
+ return;
+
+ nr_isolated = 0;
+ num_cpus = cluster->num_cpus;
+ spin_lock_irqsave(&state_lock, flags);
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!num_cpus--)
+ break;
+
+ if (!is_active(c))
+ continue;
+ if (cluster->active_cpus <= cluster->max_cpus)
+ break;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ pr_debug("Trying to isolate CPU%u\n", c->cpu);
+ if (!sched_isolate_cpu(c->cpu)) {
+ c->isolated_by_us = true;
+ move_cpu_lru(c);
+ nr_isolated++;
+ } else {
+ pr_debug("Unable to isolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ spin_lock_irqsave(&state_lock, flags);
+ }
+ cluster->nr_isolated_cpus += nr_isolated;
+ spin_unlock_irqrestore(&state_lock, flags);
+
+}
+/*
+ * __try_to_unisolate - bring back CPUs isolated by core_ctl.
+ * @cluster: cluster to grow
+ * @need:    target number of active CPUs
+ * @force:   when false, CPUs flagged not_preferred are skipped
+ *
+ * Walks the LRU list and unisolates CPUs flagged isolated_by_us until
+ * active_cpus equals @need. Successfully unisolated CPUs are moved to the
+ * LRU tail and nr_isolated_cpus is reduced accordingly.
+ */
+static void __try_to_unisolate(struct cluster_data *cluster,
+ unsigned int need, bool force)
+{
+ struct cpu_data *c, *tmp;
+ unsigned long flags;
+ unsigned int num_cpus = cluster->num_cpus;
+ unsigned int nr_unisolated = 0;
+
+ /*
+ * Protect against entry being removed (and added at tail) by other
+ * thread (hotplug).
+ *
+ * The lock is dropped around sched_unisolate_cpu() below, so num_cpus
+ * additionally bounds the walk to at most cluster->num_cpus entries.
+ */
+ spin_lock_irqsave(&state_lock, flags);
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!num_cpus--)
+ break;
+
+ if (!c->isolated_by_us)
+ continue;
+ if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) ||
+ (!force && c->not_preferred))
+ continue; /* already active, or not preferred and not forced */
+ if (cluster->active_cpus == need)
+ break;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ pr_debug("Trying to unisolate CPU%u\n", c->cpu);
+ if (!sched_unisolate_cpu(c->cpu)) { /* zero return is treated as success */
+ c->isolated_by_us = false;
+ move_cpu_lru(c);
+ nr_unisolated++;
+ } else {
+ pr_debug("Unable to unisolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ spin_lock_irqsave(&state_lock, flags);
+ }
+ cluster->nr_isolated_cpus -= nr_unisolated;
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+/*
+ * Grow the cluster to @need active CPUs. CPUs not flagged not_preferred are
+ * tried first; only if that does not reach @need are the not_preferred ones
+ * used as well.
+ */
+static void try_to_unisolate(struct cluster_data *cluster, unsigned int need)
+{
+	/* Pass 1: respect the not_preferred flags. */
+	__try_to_unisolate(cluster, need, false);
+
+	/* Pass 2: still short of the target, so use not_preferred CPUs too. */
+	if (cluster->active_cpus != need)
+		__try_to_unisolate(cluster, need, true);
+}
+
+/*
+ * Enforce the cluster's current need: clamp need_cpus to the limits and
+ * isolate or unisolate CPUs to move the active count towards it.
+ */
+static void __ref do_core_ctl(struct cluster_data *cluster)
+{
+	unsigned int need = apply_limits(cluster, cluster->need_cpus);
+
+	if (!adjustment_possible(cluster, need))
+		return;
+
+	pr_debug("Trying to adjust group %u from %u to %u\n",
+		 cluster->first_cpu, cluster->active_cpus, need);
+
+	if (cluster->active_cpus > need)
+		try_to_isolate(cluster, need);
+	else if (cluster->active_cpus < need)
+		try_to_unisolate(cluster, need);
+}
+/*
+ * try_core_ctl - body of the per-cluster core_ctl kernel thread.
+ * @data: the struct cluster_data this thread serves
+ *
+ * Sleeps until wake_up_core_ctl_thread() sets cluster->pending, then clears
+ * the flag and runs do_core_ctl(). The task state is set to
+ * TASK_INTERRUPTIBLE before the flag is tested. The stop request is only
+ * checked after waking from schedule(). Returns 0 when stopped.
+ */
+static int __ref try_core_ctl(void *data)
+{
+ struct cluster_data *cluster = data;
+ unsigned long flags;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ if (!cluster->pending) {
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+ schedule(); /* sleep until woken */
+ if (kthread_should_stop())
+ break;
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ }
+ set_current_state(TASK_RUNNING);
+ cluster->pending = false; /* consume the request while holding the lock */
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+ do_core_ctl(cluster);
+ }
+
+ return 0;
+}
+/*
+ * cpu_callback - CPU hotplug notifier for core_ctl.
+ *
+ * Handles CPU_ONLINE and CPU_DEAD (with or without CPU_TASKS_FROZEN): updates
+ * the cluster's active CPU count and LRU order, drops isolation of a CPU
+ * that died while isolated by core_ctl, and wakes the core_ctl thread if the
+ * active count can now be moved towards the cluster's need. Other actions,
+ * and CPUs without an initialised cluster, return NOTIFY_DONE.
+ */
+static int __ref cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ uint32_t cpu = (uintptr_t)hcpu;
+ struct cpu_data *state = &per_cpu(cpu_state, cpu);
+ struct cluster_data *cluster = state->cluster;
+ unsigned int need;
+ bool do_wakeup, unisolated = false;
+ unsigned long flags;
+
+ if (unlikely(!cluster || !cluster->inited))
+ return NOTIFY_DONE;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ cluster->active_cpus = get_active_cpu_count(cluster);
+
+ /*
+ * Moving to the end of the list should only happen in
+ * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an
+ * infinite list traversal when thermal (or other entities)
+ * reject trying to online CPUs.
+ */
+ move_cpu_lru(state);
+ break;
+
+ case CPU_DEAD:
+ /*
+ * We don't want to have a CPU both offline and isolated.
+ * So unisolate a CPU that went down if it was isolated by us.
+ */
+ if (state->isolated_by_us) {
+ sched_unisolate_cpu_unlocked(cpu);
+ state->isolated_by_us = false;
+ unisolated = true; /* nr_isolated_cpus is adjusted under state_lock below */
+ }
+
+ /* Move a CPU to the end of the LRU when it goes offline. */
+ move_cpu_lru(state);
+
+ state->busy = 0;
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ need = apply_limits(cluster, cluster->need_cpus);
+ spin_lock_irqsave(&state_lock, flags);
+ if (unisolated)
+ cluster->nr_isolated_cpus--;
+ do_wakeup = adjustment_possible(cluster, need);
+ spin_unlock_irqrestore(&state_lock, flags);
+ if (do_wakeup)
+ wake_up_core_ctl_thread(cluster);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata cpu_notifier = {
+ .notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+static cpumask_var_t core_ctl_disable_cpumask;
+static bool core_ctl_disable_cpumask_present;
+
+/*
+ * Parse the "core_ctl_disable_cpumask" early parameter: a CPU list naming
+ * the CPUs for which core control is disabled. Returns 0 on success or
+ * -EINVAL for an empty or unparsable list.
+ */
+static int __init core_ctl_disable_setup(char *str)
+{
+	int parse_err;
+
+	if (!*str)
+		return -EINVAL;
+
+	alloc_bootmem_cpumask_var(&core_ctl_disable_cpumask);
+
+	parse_err = cpulist_parse(str, core_ctl_disable_cpumask);
+	if (parse_err < 0) {
+		/* Release the mask again; the parameter is ignored. */
+		free_bootmem_cpumask_var(core_ctl_disable_cpumask);
+		return -EINVAL;
+	}
+
+	core_ctl_disable_cpumask_present = true;
+	pr_info("disable_cpumask=%*pbl\n",
+		cpumask_pr_args(core_ctl_disable_cpumask));
+
+	return 0;
+}
+early_param("core_ctl_disable_cpumask", core_ctl_disable_setup);
+
+/*
+ * Should core control be skipped for the CPUs in @mask?
+ *
+ * We operate on a cluster basis: core_ctl is disabled for a cluster only if
+ * all of its CPUs are listed in core_ctl_disable_cpumask. Without that boot
+ * parameter nothing is skipped.
+ */
+static bool should_skip(const struct cpumask *mask)
+{
+	if (core_ctl_disable_cpumask_present)
+		return cpumask_subset(mask, core_ctl_disable_cpumask);
+
+	return false;
+}
+
+/*
+ * Look up an already registered cluster by its first CPU number.
+ * Returns the cluster, or NULL if none of the registered clusters matches.
+ */
+static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu)
+{
+	unsigned int idx = 0;
+
+	while (idx < num_clusters) {
+		struct cluster_data *candidate = &cluster_state[idx];
+
+		if (candidate->first_cpu == first_cpu)
+			return candidate;
+		++idx;
+	}
+
+	return NULL;
+}
+
+/*
+ * cluster_init - set up core control for the CPUs in @mask.
+ * @mask: CPUs forming one cluster (callers pass a cpufreq policy's
+ *        related_cpus)
+ *
+ * Claims the next free cluster_state slot, initialises the cluster defaults
+ * and the per-CPU state, starts the cluster's SCHED_FIFO "core_ctl/<cpu>"
+ * thread and registers the "core_ctl" sysfs directory under the first CPU's
+ * device.
+ *
+ * Returns 0 on success, or when the cluster is skipped or already set up;
+ * -ENODEV without a CPU device; -EINVAL when the cluster does not fit the
+ * compile-time limits; otherwise the error from kthread_run() or
+ * kobject_add().
+ */
+static int cluster_init(const struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	unsigned int num_cpus = cpumask_weight(mask);
+	struct cluster_data *cluster;
+	struct cpu_data *state;
+	unsigned int cpu;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+	if (should_skip(mask))
+		return 0;
+
+	if (find_cluster_by_first_cpu(first_cpu))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	if (num_clusters == MAX_CLUSTERS) {
+		pr_err("Unsupported number of clusters. Only %u supported\n",
+		       MAX_CLUSTERS);
+		return -EINVAL;
+	}
+
+	/*
+	 * Validate the cluster size before claiming a slot, so a rejected
+	 * cluster neither consumes an entry of cluster_state[] nor leaves a
+	 * half-initialised one behind for the lookup and iteration helpers.
+	 */
+	if (num_cpus > MAX_CPUS_PER_CLUSTER) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+
+	cluster = &cluster_state[num_clusters];
+	++num_clusters;
+
+	cpumask_copy(&cluster->cpu_mask, mask);
+	cluster->num_cpus = num_cpus;
+	cluster->first_cpu = first_cpu;
+	cluster->min_cpus = 1;
+	cluster->max_cpus = cluster->num_cpus;
+	cluster->need_cpus = cluster->num_cpus;
+	cluster->offline_delay_ms = 100;
+	cluster->task_thres = UINT_MAX;
+	cluster->nrrun = cluster->num_cpus;
+	cluster->enable = true;
+	INIT_LIST_HEAD(&cluster->lru);
+	spin_lock_init(&cluster->pending_lock);
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cluster = cluster;
+		state->cpu = cpu;
+		list_add_tail(&state->sib, &cluster->lru);
+	}
+	cluster->active_cpus = get_active_cpu_count(cluster);
+
+	/*
+	 * NOTE(review): if the thread cannot be created the slot stays
+	 * claimed with inited == false; users that test ->inited skip it.
+	 */
+	cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster,
+					       "core_ctl/%d", first_cpu);
+	if (IS_ERR(cluster->core_ctl_thread))
+		return PTR_ERR(cluster->core_ctl_thread);
+
+	sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO,
+				   &param);
+
+	cluster->inited = true;
+
+	kobject_init(&cluster->kobj, &ktype_core_ctl);
+	return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl");
+}
+
+/*
+ * cpufreq policy notifier: when a policy is created, set up core control for
+ * the CPUs related to it. Failures are only logged; always returns NOTIFY_OK.
+ */
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+			     void *data)
+{
+	struct cpufreq_policy *policy = data;
+	int ret;
+
+	if (val != CPUFREQ_CREATE_POLICY)
+		return NOTIFY_OK;
+
+	ret = cluster_init(policy->related_cpus);
+	if (ret)
+		pr_warn("unable to create core ctl group: %d\n", ret);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pol_nb = {
+	.notifier_call = cpufreq_policy_cb,
+};
+
+/*
+ * cpufreq governor-info notifier: forward a CPU's reported load to
+ * core_ctl_set_busy() on CPUFREQ_LOAD_CHANGE. Always returns NOTIFY_OK.
+ */
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct cpufreq_govinfo *info = data;
+
+	if (val == CPUFREQ_LOAD_CHANGE)
+		core_ctl_set_busy(info->cpu, info->load);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_gov_nb = {
+	.notifier_call = cpufreq_gov_cb,
+};
+
+/*
+ * core_ctl_init - late initcall that brings up core control.
+ *
+ * Does nothing when every possible CPU is excluded via
+ * core_ctl_disable_cpumask. Otherwise derives the check interval, registers
+ * the hotplug and cpufreq notifiers, and creates a cluster for the cpufreq
+ * policy of each online CPU. Cluster creation failures are only logged.
+ */
+static int __init core_ctl_init(void)
+{
+	unsigned int cpu;
+
+	if (should_skip(cpu_possible_mask))
+		return 0;
+
+	core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE)
+					* NSEC_PER_MSEC;
+
+	register_cpu_notifier(&cpu_notifier);
+	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+
+	cpu_maps_update_begin();
+	for_each_online_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+		int ret;
+
+		if (!policy)
+			continue;
+
+		ret = cluster_init(policy->related_cpus);
+		if (ret)
+			pr_warn("unable to create core ctl group: %d\n", ret);
+		cpufreq_cpu_put(policy);
+	}
+	cpu_maps_update_done();
+
+	initialized = true;
+	return 0;
+}
+
+late_initcall(core_ctl_init);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..14225d5d8617 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -27,6 +27,8 @@
* of the License.
*/
+#include "sched.h"
+
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -51,6 +53,27 @@ static int convert_prio(int prio)
}
/**
+ * drop_nopreempt_cpus - remove a cpu from the mask if it is likely
+ * non-preemptible
+ * @lowest_mask: mask with selected CPUs (non-NULL)
+ */
+static void
+drop_nopreempt_cpus(struct cpumask *lowest_mask)
+{
+	unsigned int cpu;
+
+	/* Clearing the bit of the CPU just visited does not disturb the walk. */
+	for_each_cpu(cpu, lowest_mask) {
+		/* unlocked access */
+		struct task_struct *curr = READ_ONCE(cpu_rq(cpu)->curr);
+
+		if (task_may_not_preempt(curr, cpu))
+			cpumask_clear_cpu(cpu, lowest_mask);
+	}
+}
+
+/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
@@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
{
int idx = 0;
int task_pri = convert_prio(p->prio);
+ bool drop_nopreempts = task_pri <= MAX_RT_PRIO;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+retry:
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
int skip = 0;
@@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
+ if (drop_nopreempts)
+ drop_nopreempt_cpus(lowest_mask);
/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
@@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
return 1;
}
-
+ /*
+ * If we can't find any non-preemptible cpu's, retry so we can
+ * find the lowest priority target and avoid priority inversion.
+ */
+ if (drop_nopreempts) {
+ drop_nopreempts = false;
+ goto retry;
+ }
return 0;
}
@@ -246,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp)
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
+
+/*
+ * cpupri_check_rt - check whether the current CPU's recorded cpupri priority
+ * is above CPUPRI_NORMAL, i.e. whether it has a RT task.
+ * Should be called from an rcu-sched read section.
+ */
+bool cpupri_check_rt(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpupri *cp = &cpu_rq(cpu)->rd->cpupri;
+
+	return cp->cpu_to_pri[cpu] > CPUPRI_NORMAL;
+}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index fc2cfd6b2941..e6ec68c15aa3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -6,7 +6,6 @@
#include <linux/context_tracking.h>
#include <linux/cpufreq_times.h>
#include "sched.h"
-#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -51,10 +50,8 @@ void irqtime_account_irq(struct task_struct *curr)
unsigned long flags;
s64 delta;
int cpu;
-#ifdef CONFIG_SCHED_WALT
u64 wallclock;
bool account = true;
-#endif
if (!sched_clock_irqtime)
return;
@@ -62,10 +59,8 @@ void irqtime_account_irq(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
-#ifdef CONFIG_SCHED_WALT
wallclock = sched_clock_cpu(cpu);
-#endif
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ delta = wallclock - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
irq_time_write_begin();
@@ -79,16 +74,16 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
-#ifdef CONFIG_SCHED_WALT
else
account = false;
-#endif
irq_time_write_end();
-#ifdef CONFIG_SCHED_WALT
+
if (account)
- walt_account_irqtime(cpu, curr, delta, wallclock);
-#endif
+ sched_account_irqtime(cpu, curr, delta, wallclock);
+ else if (curr != this_cpu_ksoftirqd())
+ sched_account_irqstart(cpu, curr, wallclock);
+
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5c6ffddcafcd..188c8388a63f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -291,9 +291,11 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
/*
* By now the task is replenished and enqueued; migrate it.
*/
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
activate_task(later_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
if (!fallback)
resched_curr(later_rq);
@@ -992,6 +994,41 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Deadline-class hook: add task @p to the runqueue's HMP statistics
+ * (rq->hmp_stats) via inc_cumulative_runnable_avg().
+ */
+static void
+inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+/*
+ * Deadline-class hook: remove task @p from the runqueue's HMP statistics
+ * (rq->hmp_stats) via dec_cumulative_runnable_avg().
+ */
+static void
+dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+/*
+ * Deadline-class hook: adjust the runqueue's HMP statistics when the load of
+ * task @p changes to @new_task_load. The deltas (new minus current) are
+ * passed to fixup_cumulative_runnable_avg().
+ */
+static void
+fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA; /* NOTE(review): macro defined elsewhere; presumably uses p and new_pred_demand - confirm */
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+/*
+ * No-op stubs used when CONFIG_SCHED_HMP is disabled, so callers need no
+ * conditional compilation.
+ */
+static inline void
+inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
@@ -1001,7 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
- walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
+ inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -1016,7 +1053,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
- walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
+ dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -1712,6 +1749,7 @@ retry:
goto retry;
}
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
clear_average_bw(&next_task->dl, &rq->dl);
next_task->on_rq = TASK_ON_RQ_MIGRATING;
@@ -1719,6 +1757,7 @@ retry:
next_task->on_rq = TASK_ON_RQ_QUEUED;
add_average_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
resched_curr(later_rq);
@@ -1804,6 +1843,7 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
clear_average_bw(&p->dl, &src_rq->dl);
p->on_rq = TASK_ON_RQ_MIGRATING;
@@ -1811,6 +1851,7 @@ static void pull_dl_task(struct rq *this_rq)
p->on_rq = TASK_ON_RQ_QUEUED;
add_average_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
dmin = p->dl.deadline;
/* Is there any other task even earlier? */
@@ -2012,6 +2053,11 @@ const struct sched_class dl_sched_class = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_dl,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_dl,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7f7116622631..ed8e6bb4531b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -227,6 +227,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
cfs_rq->throttle_count);
+ SEQ_printf(m, " .%-30s: %d\n", "runtime_enabled",
+ cfs_rq->runtime_enabled);
+#ifdef CONFIG_SCHED_HMP
+ SEQ_printf(m, " .%-30s: %d\n", "nr_big_tasks",
+ cfs_rq->hmp_stats.nr_big_tasks);
+ SEQ_printf(m, " .%-30s: %llu\n", "cumulative_runnable_avg",
+ cfs_rq->hmp_stats.cumulative_runnable_avg);
+#endif
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -306,6 +314,23 @@ do { \
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
+#ifdef CONFIG_SMP
+ P(cpu_capacity);
+#endif
+#ifdef CONFIG_SCHED_HMP
+ P(static_cpu_pwr_cost);
+ P(cluster->static_cluster_pwr_cost);
+ P(cluster->load_scale_factor);
+ P(cluster->capacity);
+ P(cluster->max_possible_capacity);
+ P(cluster->efficiency);
+ P(cluster->cur_freq);
+ P(cluster->max_freq);
+ P(cluster->exec_scale_factor);
+ P(hmp_stats.nr_big_tasks);
+ SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg",
+ rq->hmp_stats.cumulative_runnable_avg);
+#endif
#undef P
#undef PN
@@ -386,6 +411,15 @@ static void sched_debug_header(struct seq_file *m)
PN(sysctl_sched_wakeup_granularity);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
+#ifdef CONFIG_SCHED_HMP
+ P(sched_upmigrate);
+ P(sched_downmigrate);
+ P(sched_init_task_load_windows);
+ P(min_capacity);
+ P(max_capacity);
+ P(sched_ravg_window);
+ P(sched_load_granule);
+#endif
#undef PN
#undef P
@@ -408,6 +442,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
void sysrq_sched_debug_show(void)
{
int cpu;
@@ -417,6 +452,7 @@ void sysrq_sched_debug_show(void)
print_cpu(NULL, cpu);
}
+#endif
/*
* This itererator needs some explanation.
@@ -547,6 +583,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
+ unsigned int load_avg;
+
+ load_avg = pct_task_load(p);
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
@@ -624,6 +663,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_cas_attempts);
P(se.statistics.nr_wakeups_cas_count);
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ __P(load_avg);
+#ifdef CONFIG_SCHED_HMP
+ P(ravg.demand);
+#endif
+#endif
+
{
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
index b0656b7a93e3..50d183b1e156 100644
--- a/kernel/sched/energy.c
+++ b/kernel/sched/energy.c
@@ -27,7 +27,10 @@
#include <linux/sched_energy.h>
#include <linux/stddef.h>
+#include "sched.h"
+
struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+bool sched_energy_aware;
static void free_resources(void)
{
@@ -56,6 +59,13 @@ void init_sched_energy_costs(void)
int sd_level, i, nstates, cpu;
const __be32 *val;
+ if (!energy_aware()) {
+ sched_energy_aware = false;
+ return;
+ }
+
+ sched_energy_aware = true;
+
for_each_possible_cpu(cpu) {
cn = of_get_cpu_node(cpu, NULL);
if (!cn) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 125ff775fe05..78bd960c3527 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,9 +32,8 @@
#include <linux/task_work.h>
#include <linux/module.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
#include "tune.h"
#include "walt.h"
@@ -56,12 +55,6 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL;
unsigned int sysctl_sched_sync_hint_enable = 1;
unsigned int sysctl_sched_cstate_aware = 1;
-#ifdef CONFIG_SCHED_WALT
-unsigned int sysctl_sched_use_walt_cpu_util = 1;
-unsigned int sysctl_sched_use_walt_task_util = 1;
-__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
- (10 * NSEC_PER_MSEC);
-#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -254,6 +247,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#ifdef CONFIG_SMP
+static int active_load_balance_cpu_stop(void *data);
+#endif
const struct sched_class fair_sched_class;
@@ -891,12 +887,56 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
+#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
@@ -910,23 +950,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2633,7 +2656,27 @@ static inline void update_cfs_shares(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
+u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+ u32 enabled = p->flags & PF_WAKE_UP_IDLE;
+
+ return !!enabled;
+}
+EXPORT_SYMBOL(sched_get_wake_up_idle);
+
+int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
+{
+ int enable = !!wake_up_idle;
+
+ if (enable)
+ p->flags |= PF_WAKE_UP_IDLE;
+ else
+ p->flags &= ~PF_WAKE_UP_IDLE;
+
+ return 0;
+}
+EXPORT_SYMBOL(sched_set_wake_up_idle);
+
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
@@ -2713,6 +2756,1068 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
+#ifdef CONFIG_SCHED_HMP
+
+/* CPU selection flag */
+#define SBC_FLAG_PREV_CPU 0x1
+#define SBC_FLAG_BEST_CAP_CPU 0x2
+#define SBC_FLAG_CPU_COST 0x4
+#define SBC_FLAG_MIN_COST 0x8
+#define SBC_FLAG_IDLE_LEAST_LOADED 0x10
+#define SBC_FLAG_IDLE_CSTATE 0x20
+#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
+#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
+#define SBC_FLAG_CSTATE_LOAD 0x100
+#define SBC_FLAG_BEST_SIBLING 0x200
+#define SBC_FLAG_WAKER_CPU 0x400
+#define SBC_FLAG_PACK_TASK 0x800
+
+/* Cluster selection flag */
+#define SBC_FLAG_COLOC_CLUSTER 0x10000
+#define SBC_FLAG_WAKER_CLUSTER 0x20000
+#define SBC_FLAG_BACKUP_CLUSTER 0x40000
+#define SBC_FLAG_BOOST_CLUSTER 0x80000
+
+struct cpu_select_env {
+ struct task_struct *p;
+ struct related_thread_group *rtg;
+ u8 reason;
+ u8 need_idle:1;
+ u8 need_waker_cluster:1;
+ u8 sync:1;
+ enum sched_boost_policy boost_policy;
+ u8 pack_task:1;
+ int prev_cpu;
+ DECLARE_BITMAP(candidate_list, NR_CPUS);
+ DECLARE_BITMAP(backup_list, NR_CPUS);
+ u64 task_load;
+ u64 cpu_load;
+ u32 sbc_best_flag;
+ u32 sbc_best_cluster_flag;
+ struct cpumask search_cpus;
+};
+
+struct cluster_cpu_stats {
+ int best_idle_cpu, least_loaded_cpu;
+ int best_capacity_cpu, best_cpu, best_sibling_cpu;
+ int min_cost, best_sibling_cpu_cost;
+ int best_cpu_wakeup_latency;
+ u64 min_load, best_load, best_sibling_cpu_load;
+ s64 highest_spare_capacity;
+};
+
+/*
+ * Should task be woken to any available idle cpu?
+ *
+ * Waking tasks to an idle cpu has mixed implications on both performance and
+ * power. In many cases, the scheduler can't correctly estimate the impact of
+ * using idle cpus on either. PF_WAKE_UP_IDLE allows an external kernel module
+ * to pass a strong hint that the task in question should be woken to an idle
+ * cpu, generally to improve performance. Checked on both current and p.
+ */
+static inline int wake_to_idle(struct task_struct *p)
+{
+ return (current->flags & PF_WAKE_UP_IDLE) ||
+ (p->flags & PF_WAKE_UP_IDLE);
+}
+
+static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+ u64 total_load;
+
+ total_load = env->task_load + env->cpu_load;
+
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
+
+ return 0;
+}
+
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+ int tcpu = task_cpu(env->p);
+ int skip = 0;
+
+ if (!env->reason)
+ return 0;
+
+ if (is_reserved(cpu))
+ return 1;
+
+ switch (env->reason) {
+ case UP_MIGRATION:
+ skip = !idle_cpu(cpu);
+ break;
+ case IRQLOAD_MIGRATION:
+ /* Purposely fall through */
+ default:
+ skip = (cpu == tcpu);
+ break;
+ }
+
+ return skip;
+}
+
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ int tcpu;
+
+ if (!env->reason)
+ return 1;
+
+ tcpu = task_cpu(env->p);
+ switch (env->reason) {
+ case UP_MIGRATION:
+ return cluster->capacity > cpu_capacity(tcpu);
+
+ case DOWN_MIGRATION:
+ return cluster->capacity < cpu_capacity(tcpu);
+
+ default:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ if (!test_bit(cluster->id, env->candidate_list))
+ return 1;
+
+ if (!acceptable_capacity(cluster, env)) {
+ __clear_bit(cluster->id, env->candidate_list);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+ struct sched_cluster *cluster;
+
+ if (env->rtg) {
+ int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
+
+ if (task_load_will_fit(env->p, env->task_load,
+ cpu, env->boost_policy)) {
+ env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
+
+ if (env->boost_policy == SCHED_BOOST_NONE)
+ return env->rtg->preferred_cluster;
+
+ for_each_sched_cluster(cluster) {
+ if (cluster != env->rtg->preferred_cluster) {
+ __set_bit(cluster->id,
+ env->backup_list);
+ __clear_bit(cluster->id,
+ env->candidate_list);
+ }
+ }
+
+ return env->rtg->preferred_cluster;
+ }
+
+ /*
+ * Since the task load does not fit on the preferred
+ * cluster anymore, pretend that the task does not
+ * have any preferred cluster. This allows the waking
+ * task to get the appropriate CPU it needs as per the
+ * non co-location placement policy without having to
+ * wait until the preferred cluster is updated.
+ */
+ env->rtg = NULL;
+ }
+
+ for_each_sched_cluster(cluster) {
+ if (!skip_cluster(cluster, env)) {
+ int cpu = cluster_first_cpu(cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cpu);
+ if (task_load_will_fit(env->p, env->task_load, cpu,
+ env->boost_policy))
+ return cluster;
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ }
+ }
+
+ return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+ int cluster_id;
+
+ cluster_id = find_next_bit(list, end, start - 1 + 1);
+ if (cluster_id >= end)
+ return NULL;
+
+ return sched_cluster[cluster_id];
+}
+
+static void
+update_spare_capacity(struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu, int capacity,
+ u64 cpu_load)
+{
+ s64 spare_capacity = sched_ravg_window - cpu_load;
+
+ if (spare_capacity > 0 &&
+ (spare_capacity > stats->highest_spare_capacity ||
+ (spare_capacity == stats->highest_spare_capacity &&
+ ((!env->need_waker_cluster &&
+ capacity > cpu_capacity(stats->best_capacity_cpu)) ||
+ (env->need_waker_cluster &&
+ cpu_rq(cpu)->nr_running <
+ cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
+ /*
+ * If sync waker is the only runnable of CPU, cr_avg of the
+ * CPU is 0 so we have high chance to place the wakee on the
+ * waker's CPU which likely causes preemption of the waker.
+ * This can lead to migration of the preempted waker. Place the
+ * wakee on the real idle CPU when it's possible by checking
+ * nr_running to avoid such preemption.
+ */
+ stats->highest_spare_capacity = spare_capacity;
+ stats->best_capacity_cpu = cpu;
+ }
+}
+
+static inline void find_backup_cluster(
+struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+ int i;
+ struct cpumask search_cpus;
+
+ extern int num_clusters;
+
+ while (!bitmap_empty(env->backup_list, num_clusters)) {
+ next = next_candidate(env->backup_list, 0, num_clusters);
+ __clear_bit(next->id, env->backup_list);
+
+ cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
+ for_each_cpu(i, &search_cpus) {
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i), power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ update_spare_capacity(stats, env, i, next->capacity,
+ cpu_load_sync(i, env->sync));
+ }
+ env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
+ }
+}
+
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
+ struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+
+ extern int num_clusters;
+
+ __clear_bit(cluster->id, env->candidate_list);
+
+ if (env->rtg && preferred_cluster(cluster, env->p))
+ return NULL;
+
+ do {
+ if (bitmap_empty(env->candidate_list, num_clusters))
+ return NULL;
+
+ next = next_candidate(env->candidate_list, 0, num_clusters);
+ if (next) {
+ if (next->min_power_cost > stats->min_cost) {
+ clear_bit(next->id, env->candidate_list);
+ next = NULL;
+ continue;
+ }
+
+ if (skip_cluster(next, env))
+ next = NULL;
+ }
+ } while (!next);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(next));
+ return next;
+}
+
+#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int wakeup_latency;
+ int prev_cpu = env->prev_cpu;
+
+ wakeup_latency = cpu_rq(cpu)->wakeup_latency;
+
+ if (env->need_idle) {
+ stats->min_cost = cpu_cost;
+ if (idle_cpu(cpu)) {
+ if (wakeup_latency < stats->best_cpu_wakeup_latency ||
+ (wakeup_latency == stats->best_cpu_wakeup_latency &&
+ cpu == prev_cpu)) {
+ stats->best_idle_cpu = cpu;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ }
+ } else {
+ if (env->cpu_load < stats->min_load ||
+ (env->cpu_load == stats->min_load &&
+ cpu == prev_cpu)) {
+ stats->least_loaded_cpu = cpu;
+ stats->min_load = env->cpu_load;
+ }
+ }
+
+ return;
+ }
+
+ if (cpu_cost < stats->min_cost) {
+ stats->min_cost = cpu_cost;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CPU_COST;
+ return;
+ }
+
+ /* CPU cost is the same. Start breaking the tie by C-state */
+
+ if (wakeup_latency > stats->best_cpu_wakeup_latency)
+ return;
+
+ if (wakeup_latency < stats->best_cpu_wakeup_latency) {
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
+ return;
+ }
+
+ /* C-state is the same. Use prev CPU to break the tie */
+ if (cpu == prev_cpu) {
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
+ return;
+ }
+
+ if (stats->best_cpu != prev_cpu &&
+ ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
+ (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
+ }
+}
+#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int prev_cpu = env->prev_cpu;
+
+ if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+ if (stats->best_sibling_cpu_cost > cpu_cost ||
+ (stats->best_sibling_cpu_cost == cpu_cost &&
+ stats->best_sibling_cpu_load > env->cpu_load)) {
+ stats->best_sibling_cpu_cost = cpu_cost;
+ stats->best_sibling_cpu_load = env->cpu_load;
+ stats->best_sibling_cpu = cpu;
+ }
+ }
+
+ if ((cpu_cost < stats->min_cost) ||
+ ((stats->best_cpu != prev_cpu &&
+ stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
+ if (env->need_idle) {
+ if (idle_cpu(cpu)) {
+ stats->min_cost = cpu_cost;
+ stats->best_idle_cpu = cpu;
+ }
+ } else {
+ stats->min_cost = cpu_cost;
+ stats->min_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_MIN_COST;
+ }
+ }
+}
+#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env)
+{
+ int cpu_cost;
+
+ /*
+ * We try to find the least loaded *busy* CPU irrespective
+ * of the power cost.
+ */
+ if (env->pack_task)
+ cpu_cost = cpu_min_power_cost(cpu);
+
+ else
+ cpu_cost = power_cost(cpu, task_load(env->p) +
+ cpu_cravg_sync(cpu, env->sync));
+
+ if (cpu_cost <= stats->min_cost)
+ __update_cluster_stats(cpu, stats, env, cpu_cost);
+}
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int i;
+ struct cpumask search_cpus;
+
+ cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
+
+ env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
+
+ for_each_cpu(i, &search_cpus) {
+ env->cpu_load = cpu_load_sync(i, env->sync);
+
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ if (skip_cpu(i, env))
+ continue;
+
+ update_spare_capacity(stats, env, i, c->capacity,
+ env->cpu_load);
+
+ /*
+ * need_idle takes precedence over sched boost but when both
+ * are set, idlest CPU within all the clusters is selected
+ * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the
+ * big cluster is selected when boost_policy = BOOST_ON_BIG.
+ */
+ if ((!env->need_idle &&
+ env->boost_policy != SCHED_BOOST_NONE) ||
+ env->need_waker_cluster ||
+ sched_cpu_high_irqload(i) ||
+ spill_threshold_crossed(env, cpu_rq(i)))
+ continue;
+
+ update_cluster_stats(i, stats, env);
+ }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+ stats->best_cpu = stats->best_idle_cpu = -1;
+ stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
+ stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+ stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+ stats->highest_spare_capacity = 0;
+ stats->least_loaded_cpu = -1;
+ stats->best_cpu_wakeup_latency = INT_MAX;
+ /* No need to initialize stats->best_load */
+}
+
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+ if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+ env->reason)
+ return true;
+
+ return false;
+}
+
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int prev_cpu;
+ struct task_struct *task = env->p;
+ struct sched_cluster *cluster;
+
+ if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
+ return false;
+
+ prev_cpu = env->prev_cpu;
+ if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
+ return false;
+
+ if (task->ravg.mark_start - task->last_cpu_selected_ts >=
+ sched_long_cpu_selection_threshold)
+ return false;
+
+ /*
+ * This function should be used by task wake up path only as it's
+ * assuming p->last_switch_out_ts as last sleep time.
+ * p->last_switch_out_ts can denote last preemption time as well as
+ * last sleep time.
+ */
+ if (task->ravg.mark_start - task->last_switch_out_ts >=
+ sched_short_sleep_task_threshold)
+ return false;
+
+ env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+ cluster = cpu_rq(prev_cpu)->cluster;
+
+ if (!task_load_will_fit(task, env->task_load, prev_cpu,
+ sched_boost_policy())) {
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ return false;
+ }
+
+ env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+ if (sched_cpu_high_irqload(prev_cpu) ||
+ spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+ update_spare_capacity(stats, env, prev_cpu,
+ cluster->capacity, env->cpu_load);
+ cpumask_clear_cpu(prev_cpu, &env->search_cpus);
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool
+wake_to_waker_cluster(struct cpu_select_env *env)
+{
+ return env->sync &&
+ task_load(current) > sched_big_waker_task_load &&
+ task_load(env->p) < sched_small_wakee_task_load;
+}
+
+static inline bool
+bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
+{
+ return sysctl_sched_prefer_sync_wakee_to_waker &&
+ cpu_rq(cpu)->nr_running == 1 &&
+ cpumask_test_cpu(cpu, &env->search_cpus);
+}
+
+static inline int
+cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
+{
+ return cpumask_intersects(&env->search_cpus, &cluster->cpus);
+}
+
+/* return cheapest cpu that can fit this task */
+static int select_best_cpu(struct task_struct *p, int target, int reason,
+ int sync)
+{
+ struct sched_cluster *cluster, *pref_cluster = NULL;
+ struct cluster_cpu_stats stats;
+ struct related_thread_group *grp;
+ unsigned int sbc_flag = 0;
+ int cpu = raw_smp_processor_id();
+ bool special;
+
+ struct cpu_select_env env = {
+ .p = p,
+ .reason = reason,
+ .need_idle = wake_to_idle(p),
+ .need_waker_cluster = 0,
+ .sync = sync,
+ .prev_cpu = target,
+ .rtg = NULL,
+ .sbc_best_flag = 0,
+ .sbc_best_cluster_flag = 0,
+ .pack_task = false,
+ };
+
+ env.boost_policy = task_sched_boost(p) ?
+ sched_boost_policy() : SCHED_BOOST_NONE;
+
+ bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+ bitmap_zero(env.backup_list, NR_CPUS);
+
+ cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
+ cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
+
+ init_cluster_cpu_stats(&stats);
+ special = env_has_special_flags(&env);
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+
+ if (grp && grp->preferred_cluster) {
+ pref_cluster = grp->preferred_cluster;
+ if (!cluster_allowed(&env, pref_cluster))
+ clear_bit(pref_cluster->id, env.candidate_list);
+ else
+ env.rtg = grp;
+ } else if (!special) {
+ cluster = cpu_rq(cpu)->cluster;
+ if (wake_to_waker_cluster(&env)) {
+ if (bias_to_waker_cpu(&env, cpu)) {
+ target = cpu;
+ sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+ SBC_FLAG_WAKER_CPU;
+ goto out;
+ } else if (cluster_allowed(&env, cluster)) {
+ env.need_waker_cluster = 1;
+ bitmap_zero(env.candidate_list, NR_CPUS);
+ __set_bit(cluster->id, env.candidate_list);
+ env.sbc_best_cluster_flag =
+ SBC_FLAG_WAKER_CLUSTER;
+ }
+ } else if (bias_to_prev_cpu(&env, &stats)) {
+ sbc_flag = SBC_FLAG_PREV_CPU;
+ goto out;
+ }
+ }
+
+ if (!special && is_short_burst_task(p)) {
+ env.pack_task = true;
+ sbc_flag = SBC_FLAG_PACK_TASK;
+ }
+retry:
+ cluster = select_least_power_cluster(&env);
+
+ if (!cluster)
+ goto out;
+
+ /*
+ * 'cluster' now points to the minimum power cluster which can satisfy
+ * task's perf goals. Walk down the cluster list starting with that
+ * cluster. For non-small tasks, skip clusters that don't have
+ * mostly_idle/idle cpus
+ */
+
+ do {
+ find_best_cpu_in_cluster(cluster, &env, &stats);
+
+ } while ((cluster = next_best_cluster(cluster, &env, &stats)));
+
+ if (env.need_idle) {
+ if (stats.best_idle_cpu >= 0) {
+ target = stats.best_idle_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_CSTATE;
+ } else if (stats.least_loaded_cpu >= 0) {
+ target = stats.least_loaded_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
+ }
+ } else if (stats.best_cpu >= 0) {
+ if (stats.best_sibling_cpu >= 0 &&
+ stats.best_cpu != task_cpu(p) &&
+ stats.min_cost == stats.best_sibling_cpu_cost) {
+ stats.best_cpu = stats.best_sibling_cpu;
+ sbc_flag |= SBC_FLAG_BEST_SIBLING;
+ }
+ sbc_flag |= env.sbc_best_flag;
+ target = stats.best_cpu;
+ } else {
+ if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
+ env.rtg = NULL;
+ goto retry;
+ }
+
+ /*
+ * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
+ * backup_list = little cluster, candidate_list = none and
+ * stats.best_capacity_cpu points to the best spare capacity
+ * CPU among the CPUs in the big cluster.
+ */
+ if (env.boost_policy == SCHED_BOOST_ON_BIG &&
+ stats.best_capacity_cpu >= 0)
+ sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
+ else
+ find_backup_cluster(&env, &stats);
+
+ if (stats.best_capacity_cpu >= 0) {
+ target = stats.best_capacity_cpu;
+ sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
+ }
+ }
+ p->last_cpu_selected_ts = sched_ktime_clock();
+out:
+ sbc_flag |= env.sbc_best_cluster_flag;
+ rcu_read_unlock();
+ trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
+ env.reason, env.sync, env.need_idle, sbc_flag, target);
+ return target;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+
+ return (&tg->list == &task_groups) ? NULL : tg;
+}
+
+/* Iterate over all cfs_rq in a cpu */
+#define for_each_cfs_rq(cfs_rq, tg, cpu) \
+ for (tg = container_of(&task_groups, struct task_group, list); \
+ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+
+ for_each_cfs_rq(cfs_rq, tg, cpu)
+ reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+
+ rcu_read_unlock();
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+
+/* Add task's contribution to a cpu's HMP statistics */
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+ * Although below check is not strictly required (as
+ * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called
+ * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on
+ * efficiency by short-circuiting for_each_sched_entity() loop when
+ * sched_disable_window_stats is set.
+ */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Remove task's contribution from a cpu's HMP statistics */
+static void
+_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /* See comment on efficiency in _inc_hmp_sched_stats_fair */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ dec_rq_hmp_stats(rq, p, change_cra);
+}
+
+static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _inc_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _dec_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se) {
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+ }
+}
+
+static int task_will_be_throttled(struct task_struct *p);
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+static void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Reset balance_interval at all sched_domain levels of given cpu, so that it
+ * honors kick.
+ */
+static inline void reset_balance_interval(int cpu)
+{
+ struct sched_domain *sd;
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ sd->balance_interval = 0;
+ rcu_read_unlock();
+}
+
+/*
+ * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal
+ * cpu as per its demand or priority).
+ *
+ * Returns the migration reason (UP_, DOWN_ or IRQLOAD_MIGRATION), 0 if none.
+ */
+static inline int migration_needed(struct task_struct *p, int cpu)
+{
+ int nice;
+ struct related_thread_group *grp;
+
+ if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
+ return 0;
+
+ /* No need to migrate task that is about to be throttled */
+ if (task_will_be_throttled(p))
+ return 0;
+
+ if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
+ cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
+ return UP_MIGRATION;
+
+ if (sched_cpu_high_irqload(cpu))
+ return IRQLOAD_MIGRATION;
+
+ nice = task_nice(p);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ /*
+ * Don't assume higher capacity means higher power. If the task
+ * is running on the power efficient CPU, avoid migrating it
+ * to a lower capacity cluster.
+ */
+ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p)) &&
+ cpu_capacity(cpu) > min_capacity &&
+ cpu_max_power_cost(cpu) == max_power_cost) {
+ rcu_read_unlock();
+ return DOWN_MIGRATION;
+ }
+
+ if (!task_will_fit(p, cpu)) {
+ rcu_read_unlock();
+ return UP_MIGRATION;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (do_migration(reason, new_cpu, cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->hmp_stats.nr_big_tasks = 0;
+ cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
+ cfs_rq->hmp_stats.pred_demands_sum = 0;
+}
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg +=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
+}
+
+static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg -=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
+
+ BUG_ON(stats->nr_big_tasks < 0 ||
+ (s64)stats->cumulative_runnable_avg < 0);
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#define dec_throttled_cfs_rq_hmp_stats(...)
+#define inc_throttled_cfs_rq_hmp_stats(...)
+
+#endif /* CONFIG_SCHED_HMP */
+
#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
#error "load tracking assumes 2^10 as unit"
#endif
@@ -2836,6 +3941,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * scaled_delta;
}
+
if (running)
sa->util_sum += scaled_delta * scale_cpu;
@@ -3434,6 +4540,12 @@ static inline int idle_balance(struct rq *rq)
return 0;
}
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { } /* no-op stub; NOTE(review): presumably the !CONFIG_SMP branch - confirm */
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { } /* no-op stub; NOTE(review): presumably the !CONFIG_SMP branch - confirm */
+
#endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4060,6 +5172,35 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Check if task is part of a hierarchy where some cfs_rq does not have any
+ * runtime left.
+ *
+ * We can't rely on throttled_hierarchy() to do this test, as
+ * cfs_rq->throttle_count will not be updated yet when this function is called
+ * from scheduler_tick()
+ */
+static int task_will_be_throttled(struct task_struct *p)
+{ /* Returns 1 if some cfs_rq above p has runtime enabled but none left, else 0. */
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
+ if (!cfs_bandwidth_used()) /* bandwidth control not in use: nothing to check */
+ return 0;
+
+ for_each_sched_entity(se) { /* walk from p's own entity up through its parents */
+ cfs_rq = cfs_rq_of(se);
+ if (!cfs_rq->runtime_enabled) /* this level has no runtime limit */
+ continue;
+ if (cfs_rq->runtime_remaining <= 0) /* limited level with its runtime used up */
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
@@ -4139,13 +5280,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
if (qcfs_rq->load.weight)
dequeue = 0;
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, task_delta);
+ dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
+ }
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4170,6 +5314,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
+
+ /* Log effect on hmp stats after throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4179,6 +5329,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
int enqueue = 1;
long task_delta;
+ struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4206,17 +5357,26 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
if (cfs_rq_throttled(cfs_rq))
break;
}
- if (!se)
+ if (!se) {
add_nr_running(rq, task_delta);
+ inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
+ }
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
+
+ /* Log effect on hmp stats after un-throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
@@ -4602,6 +5762,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ init_cfs_rq_hmp_stats(cfs_rq);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4717,7 +5878,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -4733,8 +5894,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
/*
* called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * current task is from our class.
*/
static void hrtick_update(struct rq *rq)
{
@@ -4743,8 +5903,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -4802,7 +5961,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
- walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
flags = ENQUEUE_WAKEUP;
}
@@ -4810,7 +5969,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
- walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4819,8 +5978,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ inc_rq_hmp_stats(rq, p, 1);
+ }
#ifdef CONFIG_SMP
@@ -4843,8 +6004,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
schedtune_enqueue_task(p, cpu_of(rq));
- if (!se) {
- walt_inc_cumulative_runnable_avg(rq, p);
+ if (energy_aware() && !se) {
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
@@ -4882,7 +6042,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4902,7 +6062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4911,8 +6071,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, 1);
+ dec_rq_hmp_stats(rq, p, 1);
+ }
#ifdef CONFIG_SMP
@@ -4925,8 +6087,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
schedtune_dequeue_task(p, cpu_of(rq));
- if (!se)
- walt_dec_cumulative_runnable_avg(rq, p);
#endif /* CONFIG_SMP */
hrtick_update(rq);
@@ -5339,11 +6499,6 @@ unsigned long capacity_curr_of(int cpu)
>> SCHED_CAPACITY_SHIFT;
}
-static inline bool energy_aware(void)
-{
- return sched_feat(ENERGY_AWARE);
-}
-
struct energy_env {
struct sched_group *sg_top;
struct sched_group *sg_cap;
@@ -5943,12 +7098,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
static inline unsigned long task_util(struct task_struct *p)
{
-#ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- unsigned long demand = p->ravg.demand;
- return (demand << 10) / walt_ravg_window;
- }
-#endif
return p->se.avg.util_avg;
}
@@ -6333,6 +7482,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
+ if (!(current->flags & PF_WAKE_UP_IDLE) &&
+ !(p->flags & PF_WAKE_UP_IDLE))
+ return target;
+
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
@@ -6857,6 +8010,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
+#ifdef CONFIG_SCHED_HMP
+ return select_best_cpu(p, prev_cpu, 0, sync);
+#endif
+
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p, sibling_count_hint) &&
@@ -7443,6 +8600,10 @@ enum group_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
+#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
@@ -7460,6 +8621,8 @@ struct lb_env {
unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
+ unsigned int busiest_grp_capacity;
+ unsigned int busiest_nr_running;
unsigned int flags;
@@ -7470,6 +8633,7 @@ struct lb_env {
enum fbq_type fbq_type;
enum group_type busiest_group_type;
struct list_head tasks;
+ enum sched_boost_policy boost_policy;
};
/*
@@ -7567,6 +8731,7 @@ static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
+ int twf, group_cpus;
lockdep_assert_held(&env->src_rq->lock);
@@ -7613,6 +8778,39 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
+ if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
+ if (nr_big_tasks(env->src_rq) && !is_big_task(p))
+ return 0;
+
+ if (env->boost_policy == SCHED_BOOST_ON_BIG &&
+ !task_sched_boost(p))
+ return 0;
+ }
+
+ twf = task_will_fit(p, env->dst_cpu);
+
+ /*
+ * Attempt to not pull tasks that don't fit. We may get lucky and find
+ * one that actually fits.
+ */
+ if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
+ return 0;
+
+ if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+ !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
+ return 0;
+
+ /*
+ * Group imbalance can sometimes cause work to be pulled across groups
+ * even though the group could have managed the imbalance on its own.
+ * Prevent inter-cluster migrations for big tasks when the number of
+ * tasks is lower than the capacity of the group.
+ */
+ group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
+ SCHED_CAPACITY_SCALE);
+ if (!twf && env->busiest_nr_running <= group_cpus)
+ return 0;
+
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
@@ -7620,15 +8818,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) IDLE or NEWLY_IDLE balance.
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
- if (tsk_cache_hot <= 0 ||
+ if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
@@ -7648,10 +8847,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (task_in_related_thread_group(p))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
double_unlock_balance(env->src_rq, env->dst_rq);
}
@@ -7680,6 +8881,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
+
return p;
}
return NULL;
@@ -7699,12 +8901,20 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
unsigned long load;
int detached = 0;
+ int orig_loop = env->loop;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
+ if (!same_cluster(env->dst_cpu, env->src_cpu))
+ env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
+
+ if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
+ env->flags |= LBF_IGNORE_BIG_TASKS;
+
+redo:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
@@ -7774,6 +8984,15 @@ next:
list_move_tail(&p->se.group_node, tasks);
}
+ if (env->flags & (LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
+ tasks = &env->src_rq->cfs_tasks;
+ env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
+ env->loop = orig_loop;
+ goto redo;
+ }
+
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
@@ -7792,8 +9011,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -7940,6 +9159,10 @@ struct sg_lb_stats {
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
+#ifdef CONFIG_SCHED_HMP
+ unsigned long sum_nr_big_tasks;
+ u64 group_cpu_load; /* Scaled load of all CPUs of the group */
+#endif
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
@@ -7983,10 +9206,64 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.avg_load = 0UL,
.sum_nr_running = 0,
.group_type = group_other,
+#ifdef CONFIG_SCHED_HMP
+ .sum_nr_big_tasks = 0UL,
+ .group_cpu_load = 0ULL,
+#endif
},
};
}
+#ifdef CONFIG_SCHED_HMP
+
+static int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{ /* Returns 1 when the balance between sds->local and sds->busiest should be skipped, else 0. */
+ int local_cpu, busiest_cpu;
+ int local_capacity, busiest_capacity;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
+ int boost = sched_boost();
+
+ if (!sysctl_sched_restrict_cluster_spill || /* never bail if the sysctl is off or one of these boosts is active */
+ boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
+ return 0;
+
+ local_cpu = group_first_cpu(sds->local); /* the first CPU of each group stands in for the whole group */
+ busiest_cpu = group_first_cpu(sds->busiest);
+
+ local_capacity = cpu_max_possible_capacity(local_cpu);
+ busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
+
+ if (local_pwr_cost <= busiest_pwr_cost) /* local group is not the costlier one: allow the balance */
+ return 0;
+
+ if (local_capacity > busiest_capacity && /* bigger local group and big tasks waiting in busiest: allow */
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) && /* busiest is under both per-group spill limits */
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{ /* !CONFIG_SCHED_HMP variant: never bails. */
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
@@ -8130,6 +9407,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
+ if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+ continue;
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
@@ -8161,9 +9440,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
do {
struct sched_group_capacity *sgc = group->sgc;
- capacity += sgc->capacity;
- max_capacity = max(sgc->max_capacity, max_capacity);
- min_capacity = min(sgc->min_capacity, min_capacity);
+ cpumask_t *cpus = sched_group_cpus(group);
+
+ /* Revisit this later. This won't work for MT domain */
+ if (!cpu_isolated(cpumask_first(cpus))) {
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
+ }
group = group->next;
} while (group != child->groups);
}
@@ -8279,7 +9563,7 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
static inline enum
group_type group_classify(struct sched_group *group,
- struct sg_lb_stats *sgs)
+ struct sg_lb_stats *sgs, struct lb_env *env)
{
if (sgs->group_no_capacity)
return group_overloaded;
@@ -8348,6 +9632,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, 0),
+ cpu_temp(i));
+
+ if (cpu_isolated(i))
+ continue;
+
/* if we are entering idle and there are CPUs with
* their tick stopped, do an update for them
*/
@@ -8368,6 +9660,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (nr_running > 1)
*overload = true;
+#ifdef CONFIG_SCHED_HMP
+ sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
+ sgs->group_cpu_load += cpu_load(i);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -8379,25 +9676,62 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
- if (cpu_overutilized(i)) {
+ if (energy_aware() && cpu_overutilized(i)) {
*overutilized = true;
if (!sgs->group_misfit_task && rq->misfit_task)
sgs->group_misfit_task = capacity_of(i);
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Isolated CPU has no weight */
+ if (!group->group_weight) {
+ sgs->group_capacity = 0;
+ sgs->avg_load = 0;
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_other;
+ sgs->group_weight = group->group_weight;
+ } else {
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(group, sgs, env);
+ }
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+}
- sgs->group_weight = group->group_weight;
+#ifdef CONFIG_SCHED_HMP
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{ /* Returns true, and sets LBF_BIG_TASK_ACTIVE_BALANCE, when sg should be picked for its big tasks. */
+ if (env->idle != CPU_NOT_IDLE && /* only when the balancing CPU is idle and larger than sg's CPUs */
+ cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
+ if (sgs->sum_nr_big_tasks > /* sg holds more big tasks than the current busiest pick */
+ sds->busiest_stat.sum_nr_big_tasks) {
+ env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
+ return true;
+ }
+ }
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ return false;
}
+#else
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{ /* !CONFIG_SCHED_HMP variant: never selects a group and leaves env->flags untouched. */
+ return false;
+}
+#endif
/**
* update_sd_pick_busiest - return 1 on busiest group
@@ -8419,35 +9753,40 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
+ return true;
+
if (sgs->group_type > busiest->group_type)
return true;
if (sgs->group_type < busiest->group_type)
return false;
- /*
- * Candidate sg doesn't face any serious load-balance problems
- * so don't pick it if the local sg is already filled up.
- */
- if (sgs->group_type == group_other &&
- !group_has_capacity(env, &sds->local_stat))
- return false;
+ if (energy_aware()) {
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
- /*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
- */
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+ }
asym_packing:
/* This is the busiest node in its class. */
@@ -8555,14 +9894,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
+ sgs->group_type = group_classify(sg, sgs, env);
}
/*
* Ignore task groups with misfit tasks if local group has no
* capacity or if per-cpu capacity isn't higher.
*/
- if (sgs->group_type == group_misfit_task &&
+ if (energy_aware() &&
+ sgs->group_type == group_misfit_task &&
(!group_has_capacity(env, &sds->local_stat) ||
!group_smaller_cpu_capacity(sg, sds->local)))
sgs->group_type = group_other;
@@ -8570,6 +9910,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
+ env->busiest_nr_running = sgs->sum_nr_running;
+ env->busiest_grp_capacity = sgs->group_capacity;
}
next_group:
@@ -8591,12 +9933,12 @@ next_group:
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized) {
+ if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
trace_sched_overutilized(overutilized);
}
} else {
- if (!env->dst_rq->rd->overutilized && overutilized) {
+ if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
@@ -8748,20 +10090,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
- /* Misfitting tasks should be migrated in any case */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = busiest->group_misfit_task;
- return;
- }
+ if (energy_aware()) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
- /*
- * Busiest group is overloaded, local is not, use the spare
- * cycles to maximize throughput
- */
- if (busiest->group_type == group_overloaded &&
- local->group_type <= group_misfit_task) {
- env->imbalance = busiest->load_per_task;
- return;
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
}
env->imbalance = 0;
@@ -8798,7 +10142,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
/* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task)
+ if (energy_aware() && busiest->group_type == group_misfit_task)
env->imbalance = max_t(long, env->imbalance,
busiest->group_misfit_task);
@@ -8859,6 +10203,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ goto force_balance;
+
+ if (bail_inter_cluster_balance(env, &sds))
+ goto out_balanced;
+
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -8879,7 +10229,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* Misfitting tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task) {
+ if (energy_aware() && busiest->group_type == group_misfit_task) {
goto force_balance;
}
@@ -8930,6 +10280,60 @@ out_balanced:
return NULL;
}
+#ifdef CONFIG_SCHED_HMP
+static struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{ /* Returns the chosen rq of the group, or NULL when no CPU qualifies. */
+ struct rq *busiest = NULL, *busiest_big = NULL; /* best by runnable avg / best by big-task count */
+ u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
+ int max_nr_big = 0, nr_big;
+ bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE); /* rank by big tasks as well */
+ int i;
+ cpumask_t cpus;
+
+ cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask); /* leave out isolated CPUs */
+
+ for_each_cpu(i, &cpus) {
+ struct rq *rq = cpu_rq(i);
+ u64 cumulative_runnable_avg =
+ rq->hmp_stats.cumulative_runnable_avg;
+
+ if (!cpumask_test_cpu(i, env->cpus)) /* CPU is not part of this balance attempt */
+ continue;
+
+
+ if (find_big) {
+ nr_big = nr_big_tasks(rq);
+ if (nr_big > max_nr_big || /* more big tasks wins; a tie goes to the larger runnable avg */
+ (nr_big > 0 && nr_big == max_nr_big &&
+ cumulative_runnable_avg > max_runnable_avg_big)) {
+ max_runnable_avg_big = cumulative_runnable_avg;
+ busiest_big = rq;
+ max_nr_big = nr_big;
+ continue; /* a rq taken as big-task candidate is not also compared for 'busiest' */
+ }
+ }
+
+ if (cumulative_runnable_avg > max_runnable_avg) {
+ max_runnable_avg = cumulative_runnable_avg;
+ busiest = rq;
+ }
+ }
+
+ if (busiest_big) /* the big-task candidate takes precedence */
+ return busiest_big;
+
+ env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE; /* no big-task candidate: clear the flag before falling back */
+ return busiest;
+}
+#else
+static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{ /* !CONFIG_SCHED_HMP variant: always NULL. */
+ return NULL;
+}
+#endif
+
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
@@ -8940,6 +10344,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
+#ifdef CONFIG_SCHED_HMP
+ return find_busiest_queue_hmp(env, group);
+#endif
+
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -9008,15 +10416,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* so long as it is large enough.
*/
-#define MAX_PINNED_INTERVAL 512
+#define MAX_PINNED_INTERVAL 16
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+#define NEED_ACTIVE_BALANCE_THRESHOLD 10
+
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ return 1;
+
if (env->idle == CPU_NEWLY_IDLE) {
/*
@@ -9041,7 +10454,8 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ if (energy_aware() &&
+ (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
env->src_rq->cfs.h_nr_running == 1 &&
cpu_overutilized(env->src_cpu) &&
@@ -9049,10 +10463,18 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+ return unlikely(sd->nr_balance_failed >
+ sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
-static int active_load_balance_cpu_stop(void *data);
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{ /* Returns the first CPU that is in sg's cpus and sg's mask and is not isolated. */
+ cpumask_t cpus;
+
+ cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+ cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+ return cpumask_first(&cpus); /* NOTE(review): presumably >= nr_cpu_ids when every candidate is isolated - confirm callers cope */
+}
static int should_we_balance(struct lb_env *env)
{
@@ -9071,7 +10493,8 @@ static int should_we_balance(struct lb_env *env)
sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+ cpu_isolated(cpu))
continue;
balance_cpu = cpu;
@@ -9079,7 +10502,7 @@ static int should_we_balance(struct lb_env *env)
}
if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
+ balance_cpu = group_balance_cpu_not_isolated(sg);
/*
* First idle cpu or the first cpu(busiest) in this sched group
@@ -9096,23 +10519,29 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
- int ld_moved, cur_ld_moved, active_balance = 0;
+ int ld_moved = 0, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
- struct sched_group *group;
- struct rq *busiest;
+ struct sched_group *group = NULL;
+ struct rq *busiest = NULL;
unsigned long flags;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
- .sd = sd,
- .dst_cpu = this_cpu,
- .dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
- .idle = idle,
- .loop_break = sched_nr_migrate_break,
- .cpus = cpus,
- .fbq_type = all,
- .tasks = LIST_HEAD_INIT(env.tasks),
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ .imbalance = 0,
+ .flags = 0,
+ .loop = 0,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .boost_policy = sched_boost_policy(),
};
/*
@@ -9160,12 +10589,24 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
update_rq_clock(busiest);
+ /* The world might have changed. Validate assumptions */
+ if (busiest->nr_running <= 1) {
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ env.flags &= ~LBF_ALL_PINNED;
+ goto no_move;
+ }
+
+ /*
+ * Set loop_max when rq's lock is taken to prevent a race.
+ */
+ env.loop_max = min(sysctl_sched_nr_migrate,
+ busiest->nr_running);
+
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
@@ -9253,17 +10694,22 @@ more_balance:
}
}
+no_move:
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
+ schedstat_inc(sd, lb_failed[idle]);
+
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
- if (idle != CPU_NEWLY_IDLE)
- if (env.src_grp_nr_running > 1)
+ if (idle != CPU_NEWLY_IDLE &&
+ !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
+ if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
+ }
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -9285,7 +10731,8 @@ more_balance:
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
- if (!busiest->active_balance) {
+ if (!busiest->active_balance &&
+ !cpu_isolated(cpu_of(busiest))) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
@@ -9296,17 +10743,31 @@ more_balance:
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ *continue_balancing = 0;
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ sd->nr_balance_failed =
+ sd->cache_nice_tries +
+ NEED_ACTIVE_BALANCE_THRESHOLD - 1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ /* Assumes one 'busiest' cpu that we pulled tasks from */
+ if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
+ } else {
+ check_for_freq_change(this_rq, true, false);
+ }
+ }
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
@@ -9364,6 +10825,11 @@ out_one_pinned:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
out:
+ trace_sched_load_balance(this_cpu, idle, *continue_balancing,
+ group ? group->cpumask[0] : 0,
+ busiest ? busiest->nr_running : 0,
+ env.imbalance, env.flags, ld_moved,
+ sd->balance_interval);
return ld_moved;
}
@@ -9406,6 +10872,9 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
+ if (cpu_isolated(this_cpu))
+ return 0;
+
idle_enter_fair(this_rq);
/*
@@ -9460,9 +10929,12 @@ static int idle_balance(struct rq *this_rq)
/*
* Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * now runnable tasks on the balance rq or if
+ * continue_balancing has been unset (only possible
+ * due to active migration).
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ !continue_balancing)
break;
}
rcu_read_unlock();
@@ -9514,13 +10986,19 @@ static int active_load_balance_cpu_stop(void *data)
struct task_struct *push_task = NULL;
int push_task_detached = 0;
struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .flags = 0,
+ .loop = 0,
+ .boost_policy = sched_boost_policy(),
};
+ bool moved = false;
raw_spin_lock_irq(&busiest_rq->lock);
@@ -9541,12 +11019,15 @@ static int active_load_balance_cpu_stop(void *data)
BUG_ON(busiest_rq == target_rq);
push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
if (push_task) {
if (task_on_rq_queued(push_task) &&
+ push_task->state == TASK_RUNNING &&
task_cpu(push_task) == busiest_cpu &&
cpu_online(target_cpu)) {
detach_task(push_task, &env);
push_task_detached = 1;
+ moved = true;
}
goto out_unlock;
}
@@ -9565,14 +11046,18 @@ static int active_load_balance_cpu_stop(void *data)
update_rq_clock(busiest_rq);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ moved = true;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
if (push_task)
busiest_rq->push_task = NULL;
@@ -9583,6 +11068,7 @@ out_unlock:
if (push_task_detached)
attach_one_task(target_rq, push_task);
put_task_struct(push_task);
+ clear_reserved(target_cpu);
}
if (p)
@@ -9590,6 +11076,15 @@ out_unlock:
local_irq_enable();
+ if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
+ } else if (moved) {
+ check_for_freq_change(target_rq, true, false);
+ }
+
return 0;
}
@@ -9605,9 +11100,49 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static inline int find_new_ilb(void)
+
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Choose the CPU that should run nohz idle load balancing on behalf of the
+ * calling CPU.
+ *
+ * @type: kick type; with NOHZ_KICK_RESTRICT a candidate is accepted only if
+ *	  its cpu_max_power_cost() does not exceed the calling CPU's.
+ *
+ * Returns the first idle CPU from nohz.idle_cpus_mask found while walking the
+ * caller's sched domains (so "closest" domains are searched first), after
+ * resetting that CPU's balance interval; returns nr_cpu_ids if none qualifies.
+ */
+static inline int find_new_hmp_ilb(int type)
+{
+	int this_cpu = raw_smp_processor_id();
+	struct sched_domain *domain;
+	int candidate;
+
+	rcu_read_lock();
+
+	for_each_domain(this_cpu, domain) {
+		for_each_cpu_and(candidate, nohz.idle_cpus_mask,
+				 sched_domain_span(domain)) {
+			if (!idle_cpu(candidate))
+				continue;
+
+			/* A restricted kick rejects CPUs costlier than us. */
+			if (type == NOHZ_KICK_RESTRICT &&
+			    cpu_max_power_cost(candidate) >
+			    cpu_max_power_cost(this_cpu))
+				continue;
+
+			rcu_read_unlock();
+			reset_balance_interval(candidate);
+			return candidate;
+		}
+	}
+
+	rcu_read_unlock();
+	return nr_cpu_ids;
+}
+#else /* CONFIG_SCHED_HMP */
+/*
+ * Placeholder used when CONFIG_SCHED_HMP is not set; ignores @type and
+ * always returns 0.
+ * NOTE(review): the CONFIG_SCHED_HMP variant returns nr_cpu_ids when no CPU
+ * is found - confirm that 0 is the intended value should this stub ever be
+ * called.
+ */
+static inline int find_new_hmp_ilb(int type)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
+static inline int find_new_ilb(int type)
+{
+ int ilb;
+
+#ifdef CONFIG_SCHED_HMP
+ return find_new_hmp_ilb(type);
+#endif
+
+ ilb = cpumask_first(nohz.idle_cpus_mask);
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -9620,13 +11155,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -9643,16 +11178,21 @@ static void nohz_balancer_kick(void)
return;
}
+/*
+ * Drop @cpu from nohz.idle_cpus_mask and decrement nohz.nr_cpus to match.
+ * Does nothing when the CPU is not currently set in the mask.
+ */
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+	/* Expected to be rare: the CPU never made it into the idle mask. */
+	if (unlikely(!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)))
+		return;
+
+	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+	atomic_dec(&nohz.nr_cpus);
+}
+
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
* Completely isolated CPUs don't ever set, so we must test.
*/
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ nohz_balance_clear_nohz_mask(cpu);
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -9709,7 +11249,7 @@ void nohz_balance_enter_idle(int cpu)
/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@ -9738,7 +11278,13 @@ static DEFINE_SPINLOCK(balancing);
*/
void update_max_interval(void)
{
- max_load_balance_interval = HZ*num_online_cpus()/10;
+ cpumask_t avail_mask;
+ unsigned int available_cpus;
+
+ cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+ available_cpus = cpumask_weight(&avail_mask);
+
+ max_load_balance_interval = HZ*available_cpus/10;
}
/*
@@ -9863,12 +11409,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ cpumask_t cpus;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+ for_each_cpu(balance_cpu, &cpus) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
@@ -9911,6 +11460,79 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * HMP policy for deciding whether @cpu (runqueue @rq) should kick the nohz
+ * idle load balancer.
+ *
+ * @type: in/out kick type; downgraded to NOHZ_KICK_RESTRICT when some CPU in
+ *	  @rq's sched domain is still below both spill thresholds.
+ *
+ * Returns 0 when fewer than two tasks are runnable or @rq has no sched
+ * domain, 1 otherwise.
+ */
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+	struct sched_domain *domain;
+	int sibling;
+	int kick = 0;
+
+	if (rq->nr_running < 2)
+		return 0;
+
+	/* Unrestricted cases: any idle CPU may be kicked. */
+	if (!sysctl_sched_restrict_cluster_spill)
+		return 1;
+	if (sched_boost_policy() == SCHED_BOOST_ON_ALL)
+		return 1;
+	if (cpu_max_power_cost(cpu) == max_power_cost)
+		return 1;
+
+	rcu_read_lock();
+	domain = rcu_dereference_check_sched_domain(rq->sd);
+	if (domain) {
+		kick = 1;
+		for_each_cpu(sibling, sched_domain_span(domain)) {
+			if (cpu_load(sibling) >= sched_spill_load)
+				continue;
+			if (cpu_rq(sibling)->nr_running >=
+			    sysctl_sched_spill_nr_run)
+				continue;
+
+			/*
+			 * Change the kick type to limit to CPUs that
+			 * are of equal or lower capacity.
+			 */
+			*type = NOHZ_KICK_RESTRICT;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return kick;
+}
+#else
+/*
+ * Placeholder used when CONFIG_SCHED_HMP is not set: never requests a kick
+ * and leaves *type untouched.
+ */
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Core decision on whether @cpu (runqueue @rq) should kick the nohz idle
+ * load balancer. Returns non-zero when a kick is wanted.
+ *
+ * With CONFIG_SCHED_HMP the decision is delegated entirely to
+ * _nohz_kick_needed_hmp() (which may update *type); the checks after that
+ * point only run on non-HMP builds.
+ */
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ unsigned long now = jiffies;
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+#endif
+
+ /* Not yet time for the next nohz balance pass. */
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
+
+ /* Do idle load balance if there is a misfit task */
+ if (energy_aware())
+ return rq->misfit_task;
+
+ return (rq->nr_running >= 2);
+}
+
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
@@ -9922,12 +11544,14 @@ end:
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
- unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy;
+#endif
+ int cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
@@ -9940,24 +11564,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
-
- if (time_before(now, nohz.next_balance))
- return false;
-
- if (rq->nr_running >= 2 &&
- (!energy_aware() || cpu_overutilized(cpu)))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
- /* Do idle load balance if there have misfit task */
- if (energy_aware())
- return rq->misfit_task;
-
+#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
@@ -9989,6 +11599,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
unlock:
rcu_read_unlock();
+#endif
return kick;
}
#else
@@ -10022,15 +11633,19 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ int type = NOHZ_KICK_ANY;
+
+ /* Don't need to rebalance while attached to NULL domain or
+ * cpu is isolated.
+ */
+ if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
@@ -10049,47 +11664,6 @@ static void rq_offline_fair(struct rq *rq)
unthrottle_offline_cfs_rqs(rq);
}
-static inline int
-kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
-{
- int rc = 0;
-
- /* Invoke active balance to force migrate currently running task */
- raw_spin_lock(&rq->lock);
- if (!rq->active_balance) {
- rq->active_balance = 1;
- rq->push_cpu = new_cpu;
- get_task_struct(p);
- rq->push_task = p;
- rc = 1;
- }
- raw_spin_unlock(&rq->lock);
-
- return rc;
-}
-
-void check_for_migration(struct rq *rq, struct task_struct *p)
-{
- int new_cpu;
- int active_balance;
- int cpu = task_cpu(p);
-
- if (energy_aware() && rq->misfit_task) {
- if (rq->curr->state != TASK_RUNNING ||
- rq->curr->nr_cpus_allowed == 1)
- return;
-
- new_cpu = select_energy_cpu_brute(p, cpu, 0);
- if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
- active_balance = kick_active_balance(rq, p, new_cpu);
- if (active_balance)
- stop_one_cpu_nowait(cpu,
- active_load_balance_cpu_stop,
- rq, &rq->active_balance_work);
- }
- }
-}
-
#endif /* CONFIG_SMP */
/*
@@ -10109,7 +11683,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
#ifdef CONFIG_SMP
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ if (energy_aware() &&
+ !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
@@ -10609,6 +12184,11 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 55e461055332..c30c48fde7e6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
-SCHED_FEAT(TTWU_QUEUE, true)
+SCHED_FEAT(TTWU_QUEUE, false)
#ifdef HAVE_RT_PUSH_IPI
/*
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
new file mode 100644
index 000000000000..598656b42203
--- /dev/null
+++ b/kernel/sched/hmp.c
@@ -0,0 +1,4416 @@
+/* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Implementation credits: Srivatsa Vaddagiri, Steve Muckle
+ * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti
+ * and Vikram Mulukutla
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/list_sort.h>
+#include <linux/syscore_ops.h>
+
+#include "sched.h"
+
+#include <trace/events/sched.h>
+
+#define CSTATE_LATENCY_GRANULARITY_SHIFT (6)
+
+const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
+ "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
+ "IRQ_UPDATE"};
+
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"};
+
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
+
+static bool use_cycle_counter;
+static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
+
+/*
+ * Scheduler's view of ktime in nanoseconds. While the suspend hook has the
+ * clock flagged as suspended, the timestamp latched in ktime_last is
+ * returned instead of reading the live clock.
+ */
+u64 sched_ktime_clock(void)
+{
+	if (likely(!sched_ktime_suspended))
+		return ktime_get_ns();
+
+	return ktime_to_ns(ktime_last);
+}
+
+/* Syscore resume hook: let sched_ktime_clock() read the live clock again. */
+static void sched_resume(void)
+{
+ sched_ktime_suspended = false;
+}
+
+/*
+ * Syscore suspend hook: latch the current ktime into ktime_last, then flag
+ * the clock as suspended so sched_ktime_clock() returns the latched value.
+ * Always returns 0.
+ */
+static int sched_suspend(void)
+{
+ /* Latch first so the flag is never set with a stale timestamp. */
+ ktime_last = ktime_get();
+ sched_ktime_suspended = true;
+ return 0;
+}
+
+/* Suspend/resume callbacks that freeze and unfreeze sched_ktime_clock(). */
+static struct syscore_ops sched_syscore_ops = {
+ .resume = sched_resume,
+ .suspend = sched_suspend
+};
+
+/* Late initcall that registers sched_syscore_ops; always returns 0. */
+static int __init sched_init_ops(void)
+{
+ register_syscore_ops(&sched_syscore_ops);
+ return 0;
+}
+late_initcall(sched_init_ops);
+
+/* Forget @rq's recorded ed_task if it is @p; otherwise leave it alone. */
+inline void clear_ed_task(struct task_struct *p, struct rq *rq)
+{
+ if (p == rq->ed_task)
+ rq->ed_task = NULL;
+}
+
+/* Record @wallclock as the time @p was last switched out. */
+inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
+{
+ p->last_switch_out_ts = wallclock;
+}
+
+/*
+ * Record the C-state an (idle) cpu is in.
+ *
+ * @cpu = cpu whose runqueue is updated
+ * @cstate = cstate index, 0 -> active state
+ * @wakeup_energy = energy spent in waking up cpu
+ * @wakeup_latency = latency to wakeup from cstate; stored with its low
+ *		     CSTATE_LATENCY_GRANULARITY_SHIFT bits cleared
+ */
+void
+sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
+{
+	struct rq *rq = cpu_rq(cpu);
+	int coarse_latency;
+
+	/* disregard small latency delta (64 us). */
+	coarse_latency = wakeup_latency >> CSTATE_LATENCY_GRANULARITY_SHIFT;
+	coarse_latency <<= CSTATE_LATENCY_GRANULARITY_SHIFT;
+
+	rq->cstate = cstate; /* C1, C2 etc */
+	rq->wakeup_energy = wakeup_energy;
+	rq->wakeup_latency = coarse_latency;
+}
+
+/*
+ * Record the D-state an (idle) cluster is in. The cluster is looked up
+ * through the runqueue of the first cpu in @cluster_cpus.
+ *
+ * @cluster_cpus = cpus belonging to the cluster
+ * @dstate = dstate index, 0 -> active state
+ * @wakeup_energy = energy spent in waking up cluster
+ * @wakeup_latency = latency to wakeup from cluster
+ */
+void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
+				int wakeup_energy, int wakeup_latency)
+{
+	int first_cpu = cpumask_first(cluster_cpus);
+	struct sched_cluster *cluster = cpu_rq(first_cpu)->cluster;
+
+	cluster->dstate = dstate;
+	cluster->dstate_wakeup_energy = wakeup_energy;
+	cluster->dstate_wakeup_latency = wakeup_latency;
+}
+
+/*
+ * Weak default for the per-frequency load headroom of @cpu at @freq.
+ * sched_update_freq_max_load() uses the result as a percentage of @freq.
+ */
+u32 __weak get_freq_max_load(int cpu, u32 freq)
+{
+ /* 100% by default */
+ return 100;
+}
+
+/* One row of a CPU's frequency -> maximum-demand table. */
+struct freq_max_load_entry {
+ /* The maximum load which has accounted governor's headroom. */
+ u64 hdemand;
+};
+
+/*
+ * Per-CPU table published through an RCU-managed pointer; rebuilt by
+ * sched_update_freq_max_load().
+ */
+struct freq_max_load {
+ struct rcu_head rcu; /* used by kfree_rcu() when the table is replaced */
+ int length; /* number of entries recorded when the table was built */
+ struct freq_max_load_entry freqs[0]; /* allocated with length + 1 slots */
+};
+
+static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
+static DEFINE_SPINLOCK(freq_max_load_lock);
+
+/*
+ * Weak default: no power statistics are available. Callers in this file
+ * treat a NULL return as "nothing to do".
+ */
+struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
+{
+ return NULL;
+}
+
+/*
+ * Rebuild the per-CPU freq_max_load table for every cpu in @cpumask from the
+ * power table returned by get_cpu_pwr_stats().
+ *
+ * For each frequency in a cpu's power table the entry records the demand the
+ * cpu can serve at that frequency after applying the headroom percentage
+ * from get_freq_max_load().
+ *
+ * Returns 0 on success or when no power statistics are available, -EINVAL
+ * if a cpu has no power table and -ENOMEM on allocation failure. On failure
+ * the tables of all cpus in @cpumask are dropped.
+ */
+int sched_update_freq_max_load(const cpumask_t *cpumask)
+{
+	int i, cpu, ret, len;
+	unsigned int freq;
+	struct cpu_pstate_pwr *costs;
+	struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+	struct freq_max_load *max_load, *old_max_load;
+	struct freq_max_load_entry *entry;
+	u64 max_demand_capacity, max_demand;
+	unsigned long flags;
+	u32 hfreq;
+	int hpct;
+
+	if (!per_cpu_info)
+		return 0;
+
+	spin_lock_irqsave(&freq_max_load_lock, flags);
+	max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
+	for_each_cpu(cpu, cpumask) {
+		if (!per_cpu_info[cpu].ptable) {
+			ret = -EINVAL;
+			goto fail;
+		}
+
+		/*
+		 * Update side: the pointer is stable because we hold
+		 * freq_max_load_lock, not because of an RCU read section.
+		 */
+		old_max_load = rcu_dereference_protected(
+				per_cpu(freq_max_load, cpu),
+				lockdep_is_held(&freq_max_load_lock));
+
+		/*
+		 * Sample the length once: the power stats may be updated
+		 * concurrently, and the allocation size, the recorded length
+		 * and the fill loop below must all agree.
+		 */
+		len = per_cpu_info[cpu].len;
+
+		/*
+		 * allocate len + 1 and leave the last power cost as 0 for
+		 * power_cost() can stop iterating index when
+		 * per_cpu_info[cpu].len > len of max_load due to race between
+		 * cpu power stats update and get_cpu_pwr_stats().
+		 */
+		max_load = kzalloc(sizeof(struct freq_max_load) +
+			sizeof(struct freq_max_load_entry) *
+			(len + 1), GFP_ATOMIC);
+		if (unlikely(!max_load)) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		max_load->length = len;
+
+		max_demand = max_demand_capacity *
+			     cpu_max_possible_capacity(cpu);
+
+		i = 0;
+		costs = per_cpu_info[cpu].ptable;
+		/* Never write past the len entries that were allocated. */
+		while (i < len && costs[i].freq) {
+			entry = &max_load->freqs[i];
+			freq = costs[i].freq;
+			hpct = get_freq_max_load(cpu, freq);
+			/* Out-of-range headroom falls back to 100%. */
+			if (hpct <= 0 || hpct > 100)
+				hpct = 100;
+			hfreq = div64_u64((u64)freq * hpct, 100);
+			entry->hdemand =
+				div64_u64(max_demand * hfreq,
+					  cpu_max_possible_freq(cpu));
+			i++;
+		}
+
+		rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
+		if (old_max_load)
+			kfree_rcu(old_max_load, rcu);
+	}
+
+	spin_unlock_irqrestore(&freq_max_load_lock, flags);
+	return 0;
+
+fail:
+	/* Drop every table in the mask, including ones just installed. */
+	for_each_cpu(cpu, cpumask) {
+		max_load = rcu_dereference_protected(
+				per_cpu(freq_max_load, cpu),
+				lockdep_is_held(&freq_max_load_lock));
+		if (max_load) {
+			rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
+			kfree_rcu(max_load, rcu);
+		}
+	}
+
+	spin_unlock_irqrestore(&freq_max_load_lock, flags);
+	return ret;
+}
+
+unsigned int max_possible_efficiency = 1;
+unsigned int min_possible_efficiency = UINT_MAX;
+
+/* Weak default: every cpu reports the same efficiency, SCHED_LOAD_SCALE. */
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+/*
+ * Refresh the global max_capacity / min_capacity from the capacities of the
+ * cpus that are online right now. Takes no locks itself.
+ */
+static void __update_min_max_capacity(void)
+{
+	int cpu;
+	int highest = 0, lowest = INT_MAX;
+
+	for_each_online_cpu(cpu) {
+		int cap = cpu_capacity(cpu);
+
+		if (cap > highest)
+			highest = cap;
+		if (cap < lowest)
+			lowest = cap;
+	}
+
+	max_capacity = highest;
+	min_capacity = lowest;
+}
+
+/*
+ * Locked wrapper around __update_min_max_capacity(): the recomputation runs
+ * with local interrupts disabled and the rq lock of every possible cpu held.
+ */
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ /* Take every runqueue lock, in cpu iteration order. */
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024. The division truncates.
+ */
+static unsigned long
+capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return (1024 * cluster->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ * The division truncates.
+ */
+static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return (1024 * cluster_max_freq(cluster)) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1024 (i.e. 1.0 in
+ * 1024-based fixed point). The quotient is rounded up.
+ */
+static inline unsigned long
+load_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cluster->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1024 (i.e. 1.0 in 1024-based fixed point). The quotient is rounded up.
+ */
+static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq,
+ cluster_max_freq(cluster));
+}
+
+/*
+ * Current capacity of @cluster: 1024 scaled first by the efficiency factor
+ * and then by the current-max-frequency factor. Both factors are 1024-based,
+ * hence the >> 10 after each multiplication.
+ */
+static int compute_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cluster);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+/*
+ * Like compute_capacity(), but the frequency factor is derived from the
+ * cluster's max_possible_freq instead of cluster_max_freq().
+ */
+static int compute_max_possible_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ /* 1024-based ratio of this cluster's best frequency to min_max_freq */
+ capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
+ capacity >>= 10;
+
+ return capacity;
+}
+
+/*
+ * Load scale factor of @cluster: 1024 scaled by the efficiency factor and
+ * then by the frequency factor from the load_scale_cpu_*() helpers, with a
+ * >> 10 after each multiplication to stay in 1024-based fixed point.
+ */
+static int compute_load_scale_factor(struct sched_cluster *cluster)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cluster);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cluster);
+ load_scale >>= 10;
+
+ return load_scale;
+}
+
+struct list_head cluster_head;
+static DEFINE_MUTEX(cluster_lock);
+static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
+DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
+struct sched_cluster *sched_cluster[NR_CPUS];
+int num_clusters;
+
+unsigned int max_power_cost = 1;
+
+/*
+ * Statically initialised cluster descriptor with neutral defaults (scale
+ * factors of 1024, frequencies and power costs of 1). init_clusters() fills
+ * in its cpus mask and load_lock at run time.
+ */
+struct sched_cluster init_cluster = {
+ .list = LIST_HEAD_INIT(init_cluster.list),
+ .id = 0,
+ .max_power_cost = 1,
+ .min_power_cost = 1,
+ .capacity = 1024,
+ .max_possible_capacity = 1024,
+ .efficiency = 1,
+ .load_scale_factor = 1024,
+ .cur_freq = 1,
+ .max_freq = 1,
+ .max_mitigated_freq = UINT_MAX,
+ .min_freq = 1,
+ .max_possible_freq = 1,
+ .dstate = 0,
+ .dstate_wakeup_energy = 0,
+ .dstate_wakeup_latency = 0,
+ .exec_scale_factor = 1024,
+ .notifier_sent = 0,
+ .wake_up_idle = 0,
+};
+
+/*
+ * Recompute the derived statistics (capacity, max_possible_capacity,
+ * load_scale_factor, exec_scale_factor) of every cluster, then refresh the
+ * global capacity extremes and the per-cpu freq_max_load tables. The whole
+ * update is bracketed by pre/post_big_task_count_change() over all possible
+ * cpus.
+ */
+static void update_all_clusters_stats(void)
+{
+	struct sched_cluster *cluster;
+	u64 highest_mpc = 0, lowest_mpc = U64_MAX;
+
+	pre_big_task_count_change(cpu_possible_mask);
+
+	for_each_sched_cluster(cluster) {
+		u64 mpc;
+
+		cluster->capacity = compute_capacity(cluster);
+		cluster->max_possible_capacity =
+				compute_max_possible_capacity(cluster);
+		cluster->load_scale_factor = compute_load_scale_factor(cluster);
+		cluster->exec_scale_factor =
+			DIV_ROUND_UP(cluster->efficiency * 1024,
+				     max_possible_efficiency);
+
+		/* Track the extremes using the value as stored. */
+		mpc = cluster->max_possible_capacity;
+		highest_mpc = max(highest_mpc, mpc);
+		lowest_mpc = min(lowest_mpc, mpc);
+	}
+
+	max_possible_capacity = highest_mpc;
+	min_max_possible_capacity = lowest_mpc;
+
+	__update_min_max_capacity();
+	sched_update_freq_max_load(cpu_possible_mask);
+	post_big_task_count_change(cpu_possible_mask);
+}
+
+/*
+ * Number the clusters on @head 0, 1, 2, ... in list order and record each
+ * one in the sched_cluster[] lookup array at the index matching its id.
+ */
+static void assign_cluster_ids(struct list_head *head)
+{
+	struct sched_cluster *cluster;
+	int next_id = 0;
+
+	list_for_each_entry(cluster, head, list) {
+		cluster->id = next_id;
+		sched_cluster[next_id] = cluster;
+		next_id++;
+	}
+}
+
+/*
+ * Make @dst the head of the entries currently chained on @src; whatever
+ * @dst linked to before is discarded.
+ *
+ * @sync_rcu: when true, @src is first reset to an empty list and an RCU
+ *	      grace period is waited for before the entries are re-linked.
+ *
+ * An empty @src simply leaves @dst empty. Without that guard the re-linking
+ * below would chain @dst to the @src head itself, which in the callers of
+ * this helper is an on-stack list head.
+ */
+static void
+move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
+{
+	struct list_head *first, *last;
+
+	if (list_empty(src)) {
+		INIT_LIST_HEAD_RCU(dst);
+		return;
+	}
+
+	first = src->next;
+	last = src->prev;
+
+	if (sync_rcu) {
+		INIT_LIST_HEAD_RCU(src);
+		synchronize_rcu();
+	}
+
+	first->prev = dst;
+	dst->prev = last;
+	last->next = dst;
+
+	/* Ensure list sanity before making the head visible to all CPUs. */
+	smp_mb();
+	dst->next = first;
+}
+
+/*
+ * list_sort() comparator for clusters. Returns 1 when @a should be placed
+ * after @b, 0 otherwise.
+ *
+ * Clusters are ordered by ascending max_power_cost. Don't assume higher
+ * capacity means higher power: when the power cost is equal, the cluster
+ * with the higher max_possible_capacity sorts first so that task placement
+ * starts on the higher capacity cluster.
+ */
+static int
+compare_clusters(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct sched_cluster *lhs = container_of(a, struct sched_cluster, list);
+	struct sched_cluster *rhs = container_of(b, struct sched_cluster, list);
+
+	if (lhs->max_power_cost > rhs->max_power_cost)
+		return 1;
+
+	if (lhs->max_power_cost != rhs->max_power_cost)
+		return 0;
+
+	return lhs->max_possible_capacity < rhs->max_possible_capacity;
+}
+
+/*
+ * Refresh each cluster's min/max power cost, update the global
+ * max_power_cost, and re-sort the cluster list with compare_clusters().
+ * The list is detached onto a local head (waiting for an RCU grace period),
+ * sorted and renumbered there, and then published again as cluster_head.
+ */
+static void sort_clusters(void)
+{
+	struct sched_cluster *cluster;
+	struct list_head sorted;
+	unsigned int highest_cost = 1;
+
+	INIT_LIST_HEAD(&sorted);
+
+	for_each_sched_cluster(cluster) {
+		int cpu = cluster_first_cpu(cluster);
+
+		cluster->max_power_cost = power_cost(cpu, max_task_load());
+		cluster->min_power_cost = power_cost(cpu, 0);
+
+		if (cluster->max_power_cost > highest_cost)
+			highest_cost = cluster->max_power_cost;
+	}
+	max_power_cost = highest_cost;
+
+	move_list(&sorted, &cluster_head, true);
+
+	list_sort(NULL, &sorted, compare_clusters);
+	assign_cluster_ids(&sorted);
+
+	/*
+	 * Ensure cluster ids are visible to all CPUs before making
+	 * cluster_head visible.
+	 */
+	move_list(&cluster_head, &sorted, false);
+}
+
+/*
+ * Insert @cluster into the list at @head, keeping the list ordered by
+ * ascending max_power_cost. A cluster whose cost equals that of existing
+ * entries goes after them.
+ */
+static void
+insert_cluster(struct sched_cluster *cluster, struct list_head *head)
+{
+	struct sched_cluster *pos;
+	struct list_head *prev = head;
+
+	list_for_each_entry(pos, head, list) {
+		/* Stop at the first entry that costs more than the newcomer. */
+		if (pos->max_power_cost > cluster->max_power_cost)
+			break;
+		prev = &pos->list;
+	}
+
+	list_add(&cluster->list, prev);
+}
+
+/*
+ * Allocate a cluster descriptor covering @cpus and give it neutral defaults.
+ * The efficiency comes from arch_get_cpu_efficiency() of the first cpu in
+ * @cpus and is folded into the global min/max_possible_efficiency.
+ *
+ * Returns the new cluster, or NULL (after a warning) when the atomic
+ * allocation fails.
+ */
+static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
+{
+	struct sched_cluster *cluster;
+
+	cluster = kzalloc(sizeof(*cluster), GFP_ATOMIC);
+	if (!cluster) {
+		/*
+		 * Keep the message in one literal: a backslash continuation
+		 * inside the string would pull the next line's indentation
+		 * into the log output.
+		 */
+		__WARN_printf("Cluster allocation failed. Possible bad scheduling\n");
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&cluster->list);
+	cluster->max_power_cost = 1;
+	cluster->min_power_cost = 1;
+	cluster->capacity = 1024;
+	cluster->max_possible_capacity = 1024;
+	cluster->load_scale_factor = 1024;
+	cluster->cur_freq = 1;
+	cluster->max_freq = 1;
+	cluster->max_mitigated_freq = UINT_MAX;
+	cluster->min_freq = 1;
+	cluster->max_possible_freq = 1;
+	cluster->dstate = 0;
+	cluster->dstate_wakeup_energy = 0;
+	cluster->dstate_wakeup_latency = 0;
+	cluster->freq_init_done = false;
+
+	raw_spin_lock_init(&cluster->load_lock);
+	cluster->cpus = *cpus;
+	cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
+
+	if (cluster->efficiency > max_possible_efficiency)
+		max_possible_efficiency = cluster->efficiency;
+	if (cluster->efficiency < min_possible_efficiency)
+		min_possible_efficiency = cluster->efficiency;
+
+	cluster->notifier_sent = 0;
+	return cluster;
+}
+
+/*
+ * Create a cluster for @cpus, point the runqueue of each of those cpus at
+ * it, insert it into the list at @head and account for it in
+ * all_cluster_ids / num_clusters. Does nothing if the allocation fails.
+ */
+static void add_cluster(const struct cpumask *cpus, struct list_head *head)
+{
+	struct sched_cluster *cluster;
+	int cpu;
+
+	cluster = alloc_new_cluster(cpus);
+	if (!cluster)
+		return;
+
+	for_each_cpu(cpu, cpus)
+		cpu_rq(cpu)->cluster = cluster;
+
+	insert_cluster(cluster, head);
+
+	set_bit(num_clusters, all_cluster_ids);
+	num_clusters++;
+}
+
+/*
+ * Build the cluster list from the possible cpus: one cluster is added per
+ * distinct cpu_coregroup_mask(), ids are assigned in list order, the list is
+ * published as cluster_head and all cluster statistics are recomputed.
+ */
+void update_cluster_topology(void)
+{
+ struct cpumask cpus = *cpu_possible_mask;
+ const struct cpumask *cluster_cpus;
+ struct list_head new_head;
+ int i;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_cpu(i, &cpus) {
+ cluster_cpus = cpu_coregroup_mask(i);
+ cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
+ /* Remove this group's cpus from the working copy being iterated. */
+ cpumask_andnot(&cpus, &cpus, cluster_cpus);
+ add_cluster(cluster_cpus, &new_head);
+ }
+
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+ update_all_clusters_stats();
+}
+
+/*
+ * Reset cluster bookkeeping: clear all_cluster_ids, make init_cluster span
+ * every possible cpu, initialise its load_lock and start with an empty
+ * cluster_head list.
+ */
+void init_clusters(void)
+{
+ bitmap_clear(all_cluster_ids, 0, NR_CPUS);
+ init_cluster.cpus = *cpu_possible_mask;
+ raw_spin_lock_init(&init_cluster.load_lock);
+ INIT_LIST_HEAD(&cluster_head);
+}
+
+/*
+ * Install @cb as the cpu cycle counter callbacks and switch the scheduler
+ * to using the cycle counter. The update is done under cluster_lock.
+ *
+ * Returns 0 on success, -EINVAL if @cb has no get_cpu_cycle_counter hook.
+ */
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+	int ret = 0;
+
+	mutex_lock(&cluster_lock);
+	if (cb->get_cpu_cycle_counter) {
+		cpu_cycle_counter_cb = *cb;
+		use_cycle_counter = true;
+	} else {
+		ret = -EINVAL;
+	}
+	mutex_unlock(&cluster_lock);
+
+	return ret;
+}
+
+/*
+ * Clear any HMP scheduler related requests pending from or on cpu:
+ * the boost kick, the cpu's reservation and, if one is recorded, the pending
+ * push_task (dropping the task reference and the reservation of push_cpu).
+ */
+void clear_hmp_request(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ clear_boost_kick(cpu);
+ clear_reserved(cpu);
+ /* Unlocked peek; re-checked below with rq->lock held. */
+ if (rq->push_task) {
+ struct task_struct *push_task = NULL;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->push_task) {
+ clear_reserved(rq->push_cpu);
+ push_task = rq->push_task;
+ rq->push_task = NULL;
+ }
+ rq->active_balance = 0;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /* The reference is dropped only after the rq lock is released. */
+ if (push_task)
+ put_task_struct(push_task);
+ }
+}
+
+/* Store @cost as the static power cost of @cpu's runqueue. Returns 0. */
+int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
+{
+	cpu_rq(cpu)->static_cpu_pwr_cost = cost;
+
+	return 0;
+}
+
+/* Return the static power cost recorded on @cpu's runqueue. */
+unsigned int sched_get_static_cpu_pwr_cost(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->static_cpu_pwr_cost;
+}
+
+/*
+ * Store @cost as the static power cost of the cluster @cpu belongs to.
+ * Returns 0.
+ */
+int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
+{
+	cpu_rq(cpu)->cluster->static_cluster_pwr_cost = cost;
+
+	return 0;
+}
+
+/* Return the static power cost of the cluster @cpu belongs to. */
+unsigned int sched_get_static_cluster_pwr_cost(int cpu)
+{
+	struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->static_cluster_pwr_cost;
+}
+
+/*
+ * Set the wake_up_idle flag of the cluster @cpu belongs to; any non-zero
+ * @wake_idle is stored as 1. Returns 0.
+ */
+int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle)
+{
+	cpu_rq(cpu)->cluster->wake_up_idle = !!wake_idle;
+
+	return 0;
+}
+
+/* Return the wake_up_idle flag of the cluster @cpu belongs to. */
+unsigned int sched_get_cluster_wake_idle(int cpu)
+{
+	struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->wake_up_idle;
+}
+
+/*
+ * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
+ * associated with them. This is required for atomic update of those variables
+ * when being modified via sysctl interface.
+ *
+ * IMPORTANT: Initialize both copies to same value!!
+ */
+
+/*
+ * Tasks that are runnable continuously for a period greater than
+ * EARLY_DETECTION_DURATION can be flagged early as potential
+ * high load tasks.
+ */
+#define EARLY_DETECTION_DURATION 9500000
+
+static __read_mostly unsigned int sched_ravg_hist_size = 5;
+__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
+
+static __read_mostly unsigned int sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+__read_mostly unsigned int sysctl_sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+
+#define SCHED_ACCOUNT_WAIT_TIME 1
+
+__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
+
+/*
+ * Enable colocation and frequency aggregation for all threads in a process.
+ * The children inherits the group id from the parent.
+ */
+unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
+
+
+#define SCHED_NEW_TASK_WINDOWS 5
+
+#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
+
+/*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
+
+/*
+ * For increase, send notification if
+ * freq_required - cur_freq > sysctl_sched_freq_inc_notify
+ */
+__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
+
+/*
+ * For decrease, send notification if
+ * cur_freq - freq_required > sysctl_sched_freq_dec_notify
+ */
+__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
+
+static __read_mostly unsigned int sched_io_is_busy;
+
+__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+unsigned int min_max_freq = 1;
+
+unsigned int max_capacity = 1024; /* max(rq->capacity) */
+unsigned int min_capacity = 1024; /* min(rq->capacity) */
+unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
+/* Maximum allowed threshold before freq aggregation must be enabled */
+#define MAX_FREQ_AGGR_THRESH 1000
+
+/* Temporarily disable window-stats activity on all cpus */
+unsigned int __read_mostly sched_disable_window_stats;
+
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &active_related_thread_groups, list)
+
+/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+ MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
+
+/* Size of bitmaps maintained to track top tasks */
+static const unsigned int top_tasks_bitmap_size =
+ BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
+
+/*
+ * Demand aggregation for frequency purpose:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purpose. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Lets say the stats for window M was as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
+ * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy
+ * time reported to governor will be:
+ *
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+static __read_mostly unsigned int sched_freq_aggregate = 1;
+__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
+
+unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
+static unsigned int __read_mostly sched_freq_aggregate_threshold;
+
+/* Initial task load. Newly created tasks are assigned this load. */
+unsigned int __read_mostly sched_init_task_load_windows;
+unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
+
+/*
+ * Return the upper bound of a task's load, which is the current window
+ * size (sched_ravg_window, in ns).
+ */
+unsigned int max_task_load(void)
+{
+ return sched_ravg_window;
+}
+
+/* A cpu can no longer accommodate more tasks if:
+ *
+ * rq->nr_running > sysctl_sched_spill_nr_run ||
+ * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
+ */
+unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
+
+/*
+ * Place sync wakee tasks that have less than the configured demand on the
+ * waker's cluster.
+ */
+unsigned int __read_mostly sched_small_wakee_task_load;
+unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
+
+unsigned int __read_mostly sched_big_waker_task_load;
+unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
+
+/*
+ * CPUs with load greater than sched_spill_load are not
+ * eligible for task placement. When all CPUs in a cluster reach a
+ * load higher than this level, tasks become eligible for inter
+ * cluster migration.
+ */
+unsigned int __read_mostly sched_spill_load;
+unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
+
+/*
+ * Prefer the waker CPU for sync wakee task, if the CPU has only 1 runnable
+ * task. This eliminates the LPM exit latency associated with the idle
+ * CPUs in the waker cluster.
+ */
+unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
+
+/*
+ * Tasks whose bandwidth consumption on a cpu is more than
+ * sched_upmigrate are considered "big" tasks. Big tasks will be
+ * considered for "up" migration, i.e migrating to a cpu with better
+ * capacity.
+ */
+unsigned int __read_mostly sched_upmigrate;
+unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
+
+/*
+ * Big tasks, once migrated, will need to drop their bandwidth
+ * consumption to less than sched_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_downmigrate;
+unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
+
+/*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
+
+/*
+ * The load scale factor of a CPU gets boosted when its max frequency
+ * is restricted due to which the tasks are migrating to higher capacity
+ * CPUs early. The sched_upmigrate threshold is auto-upgraded by
+ * rq->max_possible_freq/rq->max_freq of a lower capacity CPU.
+ */
+unsigned int up_down_migrate_scale_factor = 1024;
+
+/*
+ * Scheduler selects and places task to its previous CPU if sleep time is
+ * less than sysctl_sched_select_prev_cpu_us.
+ */
+unsigned int __read_mostly
+sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
+
+unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
+
+unsigned int __read_mostly
+sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
+
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds and it sleeps on an average
+ * for more than sysctl_sched_short_sleep nanoseconds, then the
+ * task is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC;
+
+/*
+ * Apply up_down_migrate_scale_factor to a pair of up/down migration
+ * thresholds, in place. A factor of 1024 means "no scaling" and leaves
+ * both values untouched.
+ *
+ * @up_migrate: up-migration threshold, updated in place
+ * @down_migrate: down-migration threshold, updated in place
+ * @is_group: true for group thresholds, which are not clamped to
+ * sched_ravg_window
+ */
+static void _update_up_down_migrate(unsigned int *up_migrate,
+ unsigned int *down_migrate, bool is_group)
+{
+ unsigned int delta;
+
+ if (up_down_migrate_scale_factor == 1024)
+ return;
+
+ /* Remember the unscaled gap between the two thresholds */
+ delta = *up_migrate - *down_migrate;
+
+ /*
+ * Scale in usec units and convert back to ns afterwards; presumably
+ * done to keep the 32-bit multiplication from overflowing.
+ */
+ *up_migrate /= NSEC_PER_USEC;
+ *up_migrate *= up_down_migrate_scale_factor;
+ *up_migrate >>= 10;
+ *up_migrate *= NSEC_PER_USEC;
+
+ /* A per-task threshold cannot exceed one full window */
+ if (!is_group)
+ *up_migrate = min(*up_migrate, sched_ravg_window);
+
+ *down_migrate /= NSEC_PER_USEC;
+ *down_migrate *= up_down_migrate_scale_factor;
+ *down_migrate >>= 10;
+ *down_migrate *= NSEC_PER_USEC;
+
+ /* Keep the scaled gap at least as wide as the unscaled one */
+ *down_migrate = min(*down_migrate, *up_migrate - delta);
+}
+
+/*
+ * Recompute the per-task and per-group up/down migration thresholds from
+ * their percentage sysctls, with up_down_migrate_scale_factor applied.
+ */
+static void update_up_down_migrate(void)
+{
+	unsigned int up, down;
+
+	/* Per-task thresholds */
+	up = pct_to_real(sysctl_sched_upmigrate_pct);
+	down = pct_to_real(sysctl_sched_downmigrate_pct);
+	_update_up_down_migrate(&up, &down, false);
+	sched_upmigrate = up;
+	sched_downmigrate = down;
+
+	/* Per-group thresholds */
+	up = pct_to_real(sysctl_sched_group_upmigrate_pct);
+	down = pct_to_real(sysctl_sched_group_downmigrate_pct);
+	_update_up_down_migrate(&up, &down, true);
+	sched_group_upmigrate = up;
+	sched_group_downmigrate = down;
+}
+
+/*
+ * Recompute the absolute HMP thresholds (spill load, up/down migrate,
+ * initial task load, short-sleep threshold, small-wakee/big-waker loads and
+ * frequency aggregation threshold) from their sysctl tunables and the
+ * current sched_ravg_window.
+ */
+void set_hmp_defaults(void)
+{
+ sched_spill_load =
+ pct_to_real(sysctl_sched_spill_load_pct);
+
+ update_up_down_migrate();
+
+ /* Percentages below are taken relative to one full window */
+ sched_init_task_load_windows =
+ div64_u64((u64)sysctl_sched_init_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ /* The sysctl is in usec; the internal threshold is kept in ns */
+ sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
+ NSEC_PER_USEC;
+
+ sched_small_wakee_task_load =
+ div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_big_waker_task_load =
+ div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_freq_aggregate_threshold =
+ pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+}
+
+/* Return the initial load percentage stored in @p (p->init_load_pct) */
+u32 sched_get_init_task_load(struct task_struct *p)
+{
+ return p->init_load_pct;
+}
+
+/*
+ * Store @init_load_pct in @p->init_load_pct.
+ *
+ * Returns 0 on success, -EINVAL if @init_load_pct is outside [0, 100].
+ */
+int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
+{
+	if (init_load_pct >= 0 && init_load_pct <= 100) {
+		p->init_load_pct = init_load_pct;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/* Return the upmigrate_discouraged flag of the task group @p belongs to */
+int upmigrate_discouraged(struct task_struct *p)
+{
+ return task_group(p)->upmigrate_discouraged;
+}
+
+#else
+
+/* Without CONFIG_CGROUP_SCHED there is no per-group flag: always 0 */
+static inline int upmigrate_discouraged(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif
+
+/*
+ * Is a task "big" on its current cpu?
+ *
+ * @scaled_load is the task's load as already scaled by the caller. Tasks
+ * with a nice value above SCHED_UPMIGRATE_MIN_NICE, or whose upmigration is
+ * discouraged, are never considered big.
+ */
+static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
+{
+	if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE)
+		return 0;
+
+	if (upmigrate_discouraged(p))
+		return 0;
+
+	return scaled_load > sched_upmigrate;
+}
+
+/*
+ * Is @p "big" on the cpu it is currently assigned to? The task's load is
+ * scaled to task_cpu(p) before being checked.
+ */
+int is_big_task(struct task_struct *p)
+{
+ return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
+}
+
+/* Return the cumulative runnable average of @cpu's runqueue, scaled to @cpu */
+u64 cpu_load(int cpu)
+{
+	return scale_load_to_cpu(cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg,
+				 cpu);
+}
+
+/* Return cpu_cravg_sync(@cpu, @sync) scaled to @cpu */
+u64 cpu_load_sync(int cpu, int sync)
+{
+ return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
+}
+
+/*
+ * A task will fit on a cpu if its bandwidth consumption on that cpu
+ * is less than sched_upmigrate. A big task that was previously
+ * "up" migrated is considered to fit on a "little" cpu only if its
+ * bandwidth consumption on that "little" cpu is less than
+ * sched_downmigrate. This helps avoid frequent migrations for
+ * tasks with load close to the upmigrate threshold.
+ *
+ * @p: task being placed
+ * @task_load: load of @p as scaled by the caller (task_will_fit() scales
+ *             it to @cpu)
+ * @cpu: candidate cpu
+ * @boost_policy: boost policy in effect. Under SCHED_BOOST_ON_BIG a boosted
+ *                task never fits on a cpu below max_capacity and the
+ *                nice/upmigrate_discouraged exemptions are not applied.
+ *
+ * Returns 1 if @p fits on @cpu, 0 otherwise.
+ */
+int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
+		       enum sched_boost_policy boost_policy)
+{
+	/*
+	 * The thresholds are unsigned; keep them unsigned here so that the
+	 * comparison against the u64 load never sees a sign-extended value.
+	 */
+	unsigned int upmigrate = sched_upmigrate;
+
+	/* Anything fits on a cpu of the highest capacity */
+	if (cpu_capacity(cpu) == max_capacity)
+		return 1;
+
+	/* Moving down in capacity: apply the lower (hysteresis) threshold */
+	if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
+		upmigrate = sched_downmigrate;
+
+	if (boost_policy != SCHED_BOOST_ON_BIG) {
+		if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
+		    upmigrate_discouraged(p))
+			return 1;
+
+		if (task_load < upmigrate)
+			return 1;
+	} else {
+		if (task_sched_boost(p) || task_load >= upmigrate)
+			return 0;
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Does @p fit on @cpu under the current boost policy? The task's load is
+ * scaled to @cpu before the check.
+ */
+int task_will_fit(struct task_struct *p, int cpu)
+{
+	u64 scaled = scale_load_to_cpu(task_load(p), cpu);
+	enum sched_boost_policy policy = sched_boost_policy();
+
+	return task_load_will_fit(p, scaled, cpu, policy);
+}
+
+/*
+ * Does the aggregate @demand of group @grp fit on @cluster?
+ *
+ * A cluster of max_capacity always fits. Otherwise a boosted group never
+ * fits, a group with no demand always fits, and the remaining cases compare
+ * the demand (scaled to the cluster's first cpu) against
+ * sched_group_upmigrate, or against sched_group_downmigrate when @cluster
+ * has less capacity than the group's current preferred cluster.
+ *
+ * Returns 1 if the group fits, 0 otherwise.
+ */
+static int
+group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
+	       u64 demand, bool group_boost)
+{
+	int first_cpu = cluster_first_cpu(cluster);
+	int preferred_capacity = 0;
+	unsigned int limit;
+	u64 scaled;
+
+	if (cluster->capacity == max_capacity)
+		return 1;
+
+	if (group_boost)
+		return 0;
+
+	if (!demand)
+		return 1;
+
+	if (grp->preferred_cluster)
+		preferred_capacity = grp->preferred_cluster->capacity;
+
+	/* Going down in capacity uses the lower (hysteresis) threshold */
+	limit = (cluster->capacity < preferred_capacity) ?
+			sched_group_downmigrate : sched_group_upmigrate;
+
+	scaled = scale_load_to_cpu(demand, first_cpu);
+
+	return scaled < limit;
+}
+
+/*
+ * Return the cost of serving @demand on CPU @cpu. This function
+ * currently assumes that the task generating @demand is the only task
+ * which will run on the CPU.
+ *
+ * Without power data for @cpu, cpu_max_possible_capacity(cpu) is returned.
+ * Otherwise the cost is taken from the cpu's power table at the index
+ * selected by searching the per-cpu freq_max_load table for @demand (the
+ * search assumes hdemand values are in ascending order). When the cpu is
+ * idle and rq->cstate is set, the static cpu power cost (plus the static
+ * cluster power cost if the cluster's dstate is set) is added.
+ */
+unsigned int power_cost(int cpu, u64 demand)
+{
+ int first, mid, last;
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+ struct cpu_pstate_pwr *costs;
+ struct freq_max_load *max_load;
+ int total_static_pwr_cost = 0;
+ struct rq *rq = cpu_rq(cpu);
+ unsigned int pc;
+
+ if (!per_cpu_info || !per_cpu_info[cpu].ptable)
+ /*
+ * When power aware scheduling is not in use, or CPU
+ * power data is not available, just use the CPU
+ * capacity as a rough stand-in for real CPU power
+ * numbers, assuming bigger CPUs are more power
+ * hungry.
+ */
+ return cpu_max_possible_capacity(cpu);
+
+ rcu_read_lock();
+ max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+ if (!max_load) {
+ /* No demand table: fall back to capacity, static cost still added */
+ pc = cpu_max_possible_capacity(cpu);
+ goto unlock;
+ }
+
+ costs = per_cpu_info[cpu].ptable;
+
+ /* Demand at or below the first entry, or above the last: clamp */
+ if (demand <= max_load->freqs[0].hdemand) {
+ pc = costs[0].power;
+ goto unlock;
+ } else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
+ pc = costs[max_load->length - 1].power;
+ goto unlock;
+ }
+
+ /*
+ * Binary search. Invariant: freqs[first].hdemand < demand and
+ * demand <= freqs[last].hdemand; stop when the two are adjacent.
+ */
+ first = 0;
+ last = max_load->length - 1;
+ mid = (last - first) >> 1;
+ while (1) {
+ if (demand <= max_load->freqs[mid].hdemand)
+ last = mid;
+ else
+ first = mid;
+
+ if (last - first == 1)
+ break;
+ mid = first + ((last - first) >> 1);
+ }
+
+ pc = costs[last].power;
+
+unlock:
+ rcu_read_unlock();
+
+ if (idle_cpu(cpu) && rq->cstate) {
+ total_static_pwr_cost += rq->static_cpu_pwr_cost;
+ if (rq->cluster->dstate)
+ total_static_pwr_cost +=
+ rq->cluster->static_cluster_pwr_cost;
+ }
+
+ return pc + total_static_pwr_cost;
+
+}
+
+/*
+ * Count @p in @stats->nr_big_tasks if it is a big task. Does nothing while
+ * window stats are disabled.
+ */
+void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+	if (!sched_disable_window_stats && is_big_task(p))
+		stats->nr_big_tasks++;
+}
+
+/*
+ * Remove @p from @stats->nr_big_tasks if it is a big task. Does nothing
+ * while window stats are disabled. The counter must never go negative.
+ */
+void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+	if (!sched_disable_window_stats) {
+		if (is_big_task(p))
+			stats->nr_big_tasks--;
+
+		BUG_ON(stats->nr_big_tasks < 0);
+	}
+}
+
+/*
+ * Account @p in @rq's HMP stats: the big task count, and, when @change_cra
+ * is non-zero, the cumulative runnable average as well.
+ */
+void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+/*
+ * Remove @p from @rq's HMP stats: the big task count, and, when @change_cra
+ * is non-zero, the cumulative runnable average as well.
+ */
+void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+/*
+ * Zero the big task count in @stats. When @reset_cra is non-zero the
+ * cumulative runnable average and predicted demand sum are zeroed too.
+ */
+void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
+{
+	stats->nr_big_tasks = 0;
+
+	if (!reset_cra)
+		return;
+
+	stats->cumulative_runnable_avg = 0;
+	stats->pred_demands_sum = 0;
+}
+
+/*
+ * Is @cluster the preferred cluster for @p?
+ *
+ * Returns 1 if @p is not in a related thread group, or if its group's
+ * preferred cluster is @cluster; 0 otherwise.
+ */
+int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+	struct related_thread_group *group;
+	int match;
+
+	rcu_read_lock();
+	group = task_related_thread_group(p);
+	match = group ? (group->preferred_cluster == cluster) : 1;
+	rcu_read_unlock();
+
+	return match;
+}
+
+/* Return the cluster that @rq belongs to (rq->cluster) */
+struct sched_cluster *rq_cluster(struct rq *rq)
+{
+ return rq->cluster;
+}
+
+/*
+ * reset_cpu_hmp_stats - reset HMP stats for a cpu
+ * nr_big_tasks
+ * cumulative_runnable_avg (iff reset_cra is true)
+ *
+ * Both the cfs_rq level stats (via reset_cfs_rq_hmp_stats()) and the
+ * runqueue level stats of @cpu are reset.
+ */
+void reset_cpu_hmp_stats(int cpu, int reset_cra)
+{
+ reset_cfs_rq_hmp_stats(cpu, reset_cra);
+ reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
+}
+
+/*
+ * Adjust @stats->nr_big_tasks for a change of @delta in @p's load.
+ *
+ * The task is classified with its current load and with its load plus
+ * @delta (both scaled to task_cpu(p)); the counter is only touched when the
+ * classification flips. Does nothing while window stats are disabled.
+ */
+void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
+			struct task_struct *p, s64 delta)
+{
+	u64 load_before, load_after;
+	int was_big, now_big;
+
+	if (sched_disable_window_stats)
+		return;
+
+	load_before = scale_load_to_cpu(task_load(p), task_cpu(p));
+	load_after = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
+
+	was_big = __is_big_task(p, load_before);
+	now_big = __is_big_task(p, load_after);
+
+	if (was_big && !now_big)
+		stats->nr_big_tasks--;
+	else if (!was_big && now_big)
+		stats->nr_big_tasks++;
+
+	BUG_ON(stats->nr_big_tasks < 0);
+}
+
+/*
+ * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
+ *
+ * NOTE(review): callers in this file hold the cpu's rq->lock (see
+ * post_big_task_count_change()); presumably that is required here.
+ */
+static void update_nr_big_tasks(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+
+ /* Do not reset cumulative_runnable_avg */
+ reset_cpu_hmp_stats(cpu, 0);
+
+ /* Re-account every task queued on the cfs task list of this rq */
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+ _inc_hmp_sched_stats_fair(rq, p, 0);
+}
+
+/*
+ * Disable interrupts and grab runqueue lock of all cpus listed in @cpus.
+ *
+ * Must be paired with post_big_task_count_change() on the same @cpus, which
+ * drops the locks and re-enables interrupts.
+ */
+void pre_big_task_count_change(const struct cpumask *cpus)
+{
+ int i;
+
+ local_irq_disable();
+
+ for_each_cpu(i, cpus)
+ raw_spin_lock(&cpu_rq(i)->lock);
+}
+
+/*
+ * Reinitialize 'nr_big_tasks' counters on all affected cpus, then release
+ * the runqueue locks taken by pre_big_task_count_change() and re-enable
+ * interrupts.
+ */
+void post_big_task_count_change(const struct cpumask *cpus)
+{
+ int i;
+
+ /* Assumes local_irq_disable() keeps online cpumap stable */
+ for_each_cpu(i, cpus)
+ update_nr_big_tasks(i);
+
+ for_each_cpu(i, cpus)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+
+ local_irq_enable();
+}
+
+DEFINE_MUTEX(policy_mutex);
+
+/*
+ * Set the frequency aggregation threshold to @threshold (a percentage) and
+ * recompute its absolute form, under policy_mutex.
+ *
+ * Returns the previous percentage value.
+ */
+unsigned int update_freq_aggregate_threshold(unsigned int threshold)
+{
+ unsigned int old_threshold;
+
+ mutex_lock(&policy_mutex);
+
+ old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
+
+ sysctl_sched_freq_aggregate_threshold_pct = threshold;
+ sched_freq_aggregate_threshold =
+ pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+
+ mutex_unlock(&policy_mutex);
+
+ return old_threshold;
+}
+
+/*
+ * Validate a write to a frequency related sysctl. Only
+ * sysctl_sched_freq_aggregate is checked; it accepts 0 or 1.
+ *
+ * Returns non-zero if the value now stored in @data is invalid.
+ */
+static inline int invalid_value_freq_input(unsigned int *data)
+{
+	if (data != &sysctl_sched_freq_aggregate)
+		return 0;
+
+	/* unsigned, so anything other than 0 or 1 is greater than 1 */
+	return *data > 1;
+}
+
+/*
+ * Validate the value just written to the window-stats sysctl @data.
+ *
+ * Returns non-zero if the value is out of range for that sysctl.
+ */
+static inline int invalid_value(unsigned int *data)
+{
+	unsigned int v = *data;
+
+	if (data == &sysctl_sched_window_stats_policy)
+		return v >= WINDOW_STATS_INVALID_POLICY;
+
+	if (data == &sysctl_sched_ravg_hist_size)
+		return (v < 2 || v > RAVG_HIST_SIZE_MAX);
+
+	return invalid_value_freq_input(data);
+}
+
+/*
+ * Handle "atomic" update of sysctl_sched_window_stats_policy,
+ * sysctl_sched_ravg_hist_size variables.
+ *
+ * A write that changes the value to something invalid is rolled back and
+ * fails with -EINVAL; a valid change resets all window stats. Reads and
+ * writes that leave the value unchanged have no further effect.
+ */
+int sched_window_update_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	unsigned int *value = (unsigned int *)table->data;
+	unsigned int previous;
+	int rc;
+
+	mutex_lock(&policy_mutex);
+
+	previous = *value;
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (rc || !write || previous == *value)
+		goto unlock;
+
+	if (invalid_value(value)) {
+		*value = previous;
+		rc = -EINVAL;
+	} else {
+		reset_all_window_stats(0, 0);
+	}
+
+unlock:
+	mutex_unlock(&policy_mutex);
+
+	return rc;
+}
+
+/*
+ * Convert percentage value into absolute form. This will avoid div() operation
+ * in fast path, to convert task load in percentage scale.
+ *
+ * Returns 0 on success, the error from proc_dointvec_minmax(), or -EINVAL
+ * (with the old value restored) if a downmigrate percentage would end up
+ * above its matching upmigrate percentage.
+ */
+int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int old_val;
+ unsigned int *data = (unsigned int *)table->data;
+ int update_task_count = 0;
+
+ /*
+ * The policy mutex is acquired with cpu_hotplug.lock
+ * held from cpu_up()->cpufreq_governor_interactive()->
+ * sched_set_window(). So enforce the same order here.
+ */
+ if (write && (data == &sysctl_sched_upmigrate_pct)) {
+ update_task_count = 1;
+ get_online_cpus();
+ }
+
+ mutex_lock(&policy_mutex);
+
+ old_val = *data;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto done;
+
+ /* Nothing to recompute if the write did not change the value */
+ if (write && (old_val == *data))
+ goto done;
+
+ if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
+ sysctl_sched_group_downmigrate_pct >
+ sysctl_sched_group_upmigrate_pct) {
+ *data = old_val;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * Big task tunable change will need to re-classify tasks on
+ * runqueue as big and set their counters appropriately.
+ * sysctl interface affects secondary variables (*_pct), which is then
+ * "atomically" carried over to the primary variables. Atomic change
+ * includes taking runqueue lock of all online cpus and re-initializing
+ * their big counter values based on changed criteria.
+ */
+ if (update_task_count)
+ pre_big_task_count_change(cpu_online_mask);
+
+ set_hmp_defaults();
+
+ if (update_task_count)
+ post_big_task_count_change(cpu_online_mask);
+
+done:
+ mutex_unlock(&policy_mutex);
+ if (update_task_count)
+ put_online_cpus();
+ return ret;
+}
+
+/* Return the number of big tasks currently accounted on @rq */
+inline int nr_big_tasks(struct rq *rq)
+{
+ return rq->hmp_stats.nr_big_tasks;
+}
+
+/*
+ * Return the temperature recorded for @cpu in the cpu power stats, or 0
+ * when no power stats are available.
+ */
+unsigned int cpu_temp(int cpu)
+{
+	struct cpu_pwr_stats *stats = get_cpu_pwr_stats();
+
+	if (!stats)
+		return 0;
+
+	return stats[cpu].temp;
+}
+
+/*
+ * Free the curr/prev window per-cpu arrays of @p.
+ *
+ * kfree() may wakeup kswapd. So this function should NOT be called
+ * with any CPU's rq->lock acquired.
+ */
+void free_task_load_ptrs(struct task_struct *p)
+{
+ kfree(p->ravg.curr_window_cpu);
+ kfree(p->ravg.prev_window_cpu);
+
+ /*
+ * update_task_ravg() can be called for exiting tasks. While the
+ * function itself ensures correct behavior, the corresponding
+ * trace event requires that these pointers be NULL.
+ */
+ p->ravg.curr_window_cpu = NULL;
+ p->ravg.prev_window_cpu = NULL;
+}
+
+/*
+ * Initialize the load tracking state of a newly created task @p.
+ *
+ * The initial demand is sched_init_task_load_windows, unless the creating
+ * task (current) has a non-zero init_load_pct, in which case that
+ * percentage of sched_ravg_window is used. The new task's own init_load_pct
+ * is reset to 0. Allocation failure of the per-cpu window arrays is fatal.
+ */
+void init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows = sched_init_task_load_windows;
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ rcu_assign_pointer(p->grp, NULL);
+ INIT_LIST_HEAD(&p->grp_list);
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ p->cpu_cycles = 0;
+ p->ravg.curr_burst = 0;
+ /*
+ * Initialize the avg_burst to twice the threshold, so that
+ * a task would not be classified as short burst right away
+ * after fork. It takes at least 6 sleep-wakeup cycles for
+ * the avg_burst to go below the threshold.
+ */
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+ p->ravg.avg_sleep_time = 0;
+
+ /* One u32 slot per possible cpu id for each window */
+ p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+ p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+
+ /* Don't have much choice. CPU frequency would be bogus */
+ BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
+
+ /* A per-creator percentage overrides the global default */
+ if (init_load_pct)
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ /* Seed demand and the whole history with the initial load */
+ p->ravg.demand = init_load_windows;
+ p->ravg.pred_demand = 0;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
+
+/* Return task demand as a percentage of max_task_load() */
+unsigned int pct_task_load(struct task_struct *p)
+{
+	return div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
+}
+
+/*
+ * Return total number of tasks "eligible" to run on highest capacity cpu
+ *
+ * This is nr_running for cpus of max_capacity and nr_big_tasks for all
+ * other cpus.
+ */
+unsigned int nr_eligible_big_tasks(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	int big_count = rq->hmp_stats.nr_big_tasks;
+	int running_count = rq->nr_running;
+
+	return is_max_capacity_cpu(cpu) ? running_count : big_count;
+}
+
+/*
+ * Is @p marked as exiting? An exiting task is identified by
+ * EXITING_TASK_MARKER stored in the first slot of its load history.
+ */
+static inline int exiting_task(struct task_struct *p)
+{
+ return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
+}
+
+/*
+ * Parse the "sched_ravg_window" boot parameter (window size in ns).
+ *
+ * A value outside [MIN_SCHED_RAVG_WINDOW, MAX_SCHED_RAVG_WINDOW], or a
+ * missing/non-numeric value, triggers a warning and returns -EINVAL;
+ * sched_ravg_window is then left unchanged.
+ *
+ * Returns 0 on success.
+ */
+static int __init set_sched_ravg_window(char *str)
+{
+	/*
+	 * get_option() takes an int pointer and does not write to it when
+	 * there is nothing to parse. Start from 0 so that case fails the
+	 * range check below instead of reading an uninitialized value.
+	 */
+	int window_size = 0;
+
+	get_option(&str, &window_size);
+
+	if (window_size < MIN_SCHED_RAVG_WINDOW ||
+	    window_size > MAX_SCHED_RAVG_WINDOW) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	sched_ravg_window = window_size;
+	return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
+
+/*
+ * Advance rq->window_start by a whole number of windows so that @wallclock
+ * falls within the window starting there. Nothing changes if less than one
+ * full window has elapsed. @wallclock must not be before rq->window_start.
+ */
+static inline void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ BUG_ON(delta < 0);
+ if (delta < sched_ravg_window)
+ return;
+
+ /* Number of complete windows elapsed since window_start */
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
+}
+
+/*
+ * 64-bit division of X by Y, rounded up. Y is evaluated twice, so it must
+ * be free of side effects.
+ */
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + ((Y) - 1), (Y))
+
+/*
+ * Scale the execution time @delta on @rq: first by the ratio of the cpu's
+ * frequency (derived from rq->cc cycles and time) to max_possible_freq,
+ * rounded up, then by the cluster's exec_scale_factor / 1024.
+ */
+static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ u32 freq;
+
+ freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
+ delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
+ delta *= rq->cluster->exec_scale_factor;
+ delta >>= 10;
+
+ return delta;
+}
+
+/*
+ * Return the number of tasks waiting on io on @rq when io wait is treated
+ * as busy time (sched_io_is_busy), 0 otherwise.
+ */
+static inline int cpu_is_waiting_on_io(struct rq *rq)
+{
+	return sched_io_is_busy ? atomic_read(&rq->nr_iowait) : 0;
+}
+
+/*
+ * Is freq_required close enough to cur_freq that no notification is needed?
+ *
+ * Returns non-zero when the required increase is below
+ * sysctl_sched_freq_inc_notify, or the possible decrease is below
+ * sysctl_sched_freq_dec_notify.
+ */
+static inline int
+nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
+{
+	int gap;
+
+	if (freq_required > cur_freq) {
+		gap = freq_required - cur_freq;
+		return gap < sysctl_sched_freq_inc_notify;
+	}
+
+	gap = cur_freq - freq_required;
+
+	return gap < sysctl_sched_freq_dec_notify;
+}
+
+/*
+ * Convert busy time to frequency equivalent.
+ *
+ * @load is scaled to @rq's cpu, expressed as a fraction of max_task_load()
+ * in 1/128 units, and that fraction of the cpu's max possible frequency is
+ * returned.
+ */
+static inline unsigned int load_to_freq(struct rq *rq, u64 load)
+{
+	load = scale_load_to_cpu(load, cpu_of(rq));
+	load *= 128;
+	load = div64_u64(load, max_task_load());
+
+	/*
+	 * Multiply and scale back down in 64 bits. Truncating the product
+	 * to 32 bits before dividing by 128 could wrap for large loads.
+	 */
+	load *= cpu_max_possible_freq(cpu_of(rq));
+	load >>= 7;	/* divide by 128 */
+
+	return load;
+}
+
+/*
+ * Return load from all related groups in given frequency domain.
+ *
+ * The group prev_runnable_sum and nt_prev_runnable_sum of every cpu in
+ * @cpus are added to *@grp_load and *@new_grp_load respectively. The
+ * outputs are accumulated into, not overwritten, so the caller must
+ * initialize them.
+ */
+static void group_load_in_freq_domain(struct cpumask *cpus,
+ u64 *grp_load, u64 *new_grp_load)
+{
+ int j;
+
+ for_each_cpu(j, cpus) {
+ struct rq *rq = cpu_rq(j);
+
+ *grp_load += rq->grp_time.prev_runnable_sum;
+ *new_grp_load += rq->grp_time.nt_prev_runnable_sum;
+ }
+}
+
+static inline u64 freq_policy_load(struct rq *rq, u64 load);
+/*
+ * Should scheduler alert governor for changing frequency?
+ *
+ * @rq - runqueue of the cpu being evaluated
+ * @check_pred - evaluate frequency based on the predictive demand
+ * @check_groups - add load from all related groups on given cpu
+ *
+ * check_groups is set to 1 if a "related" task movement/wakeup is triggering
+ * the notification check. To avoid "re-aggregation" of demand in such cases,
+ * we check whether the migrated/woken tasks demand (along with demand from
+ * existing tasks on the cpu) can be met on target cpu
+ *
+ * Returns 1 if this call marked the cluster's notifier as sent (the caller
+ * should then notify the governor), 0 if no notification is needed or one
+ * is already pending.
+ */
+
+static int send_notification(struct rq *rq, int check_pred, int check_groups)
+{
+ unsigned int cur_freq, freq_required;
+ unsigned long flags;
+ int rc = 0;
+ u64 group_load = 0, new_load = 0;
+
+ if (check_pred) {
+ u64 prev = rq->old_busy_time;
+ u64 predicted = rq->hmp_stats.pred_demands_sum;
+
+ /* Already at the cpu's max frequency: nothing to raise */
+ if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
+ return 0;
+
+ prev = max(prev, rq->old_estimated_time);
+ if (prev > predicted)
+ return 0;
+
+ cur_freq = load_to_freq(rq, prev);
+ freq_required = load_to_freq(rq, predicted);
+
+ /* Only alert when the predicted need exceeds the alert margin */
+ if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
+ return 0;
+ } else {
+ /*
+ * Protect from concurrent update of rq->prev_runnable_sum and
+ * group cpu load
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (check_groups)
+ group_load = rq->grp_time.prev_runnable_sum;
+
+ new_load = rq->prev_runnable_sum + group_load;
+ new_load = freq_policy_load(rq, new_load);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ cur_freq = load_to_freq(rq, rq->old_busy_time);
+ freq_required = load_to_freq(rq, new_load);
+
+ if (nearly_same_freq(cur_freq, freq_required))
+ return 0;
+ }
+
+ /* Test-and-set notifier_sent under rq->lock; new_load is 0 if check_pred */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->cluster->notifier_sent) {
+ rq->cluster->notifier_sent = 1;
+ rc = 1;
+ trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
+ new_load);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+/*
+ * Alert governor if there is a need to change frequency. The cpu number of
+ * @rq is passed as the notifier data.
+ */
+void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
+{
+	int cpu = cpu_of(rq);
+
+	if (send_notification(rq, check_pred, check_groups))
+		atomic_notifier_call_chain(&load_alert_notifier_head, 0,
+					   (void *)(long)cpu);
+}
+
+/*
+ * Check whether the migration of @p from @src_cpu to @dest_cpu calls for a
+ * frequency change notification.
+ *
+ * Within one frequency domain only the destination is checked, using
+ * predicted demand. Across domains both cpus are checked without
+ * prediction; the source is skipped when @src_cpu_dead is set.
+ */
+void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
+		      struct task_struct *p)
+{
+	bool grouped;
+
+	rcu_read_lock();
+	grouped = task_in_related_thread_group(p);
+	rcu_read_unlock();
+
+	if (same_freq_domain(src_cpu, dest_cpu)) {
+		check_for_freq_change(cpu_rq(dest_cpu), true, grouped);
+		return;
+	}
+
+	if (!src_cpu_dead)
+		check_for_freq_change(cpu_rq(src_cpu), false, grouped);
+
+	check_for_freq_change(cpu_rq(dest_cpu), false, grouped);
+}
+
+/*
+ * Should the time being accounted for @p on @rq count as cpu busy time?
+ *
+ * Returns non-zero if the time is to be treated as busy for @event.
+ */
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+				     u64 irqtime, int event)
+{
+	if (is_idle_task(p)) {
+		/*
+		 * TASK_WAKE and TASK_MIGRATE cannot happen on the idle task.
+		 * Apart from PICK_NEXT_TASK, only PUT_PREV_TASK, TASK_UPDATE
+		 * and IRQ_UPDATE remain: busy only for irq or io wait.
+		 */
+		if (event != PICK_NEXT_TASK)
+			return irqtime || cpu_is_waiting_on_io(rq);
+
+		return 0;
+	}
+
+	switch (event) {
+	case TASK_WAKE:
+		return 0;
+
+	case PUT_PREV_TASK:
+	case IRQ_UPDATE:
+		return 1;
+
+	case TASK_UPDATE:
+		/*
+		 * TASK_UPDATE can be called on a sleeping task, when it's
+		 * moved between related groups
+		 */
+		if (rq->curr == p)
+			return 1;
+
+		return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
+
+	default:
+		/* TASK_MIGRATE, PICK_NEXT_TASK left */
+		return SCHED_FREQ_ACCOUNT_WAIT_TIME;
+	}
+}
+
+/*
+ * A task is "new" until it has been active in at least
+ * SCHED_NEW_TASK_WINDOWS windows.
+ */
+static inline bool is_new_task(struct task_struct *p)
+{
+ return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS;
+}
+
+#define INC_STEP 8
+#define DEC_STEP 2
+#define CONSISTENT_THRES 16
+#define INC_STEP_BIG 16
+/*
+ * bucket_increase - update the count of all buckets
+ *
+ * @buckets: array of buckets tracking busy time of a task
+ * @idx: the index of bucket to be incremented
+ *
+ * Called each time a complete window finishes: the bucket the runtime fell
+ * into (@idx) is bumped, by INC_STEP_BIG once it has reached
+ * CONSISTENT_THRES and by INC_STEP otherwise, saturating at U8_MAX. Every
+ * other bucket decays by DEC_STEP, saturating at 0.
+ */
+static inline void bucket_increase(u8 *buckets, int idx)
+{
+	int b;
+
+	for (b = 0; b < NUM_BUSY_BUCKETS; b++) {
+		int step;
+
+		if (b != idx) {
+			buckets[b] = buckets[b] > DEC_STEP ?
+					buckets[b] - DEC_STEP : 0;
+			continue;
+		}
+
+		step = buckets[b] >= CONSISTENT_THRES ?
+				INC_STEP_BIG : INC_STEP;
+		buckets[b] = buckets[b] > U8_MAX - step ?
+				U8_MAX : buckets[b] + step;
+	}
+}
+
+/*
+ * Map a busy time (@normalized_rt, relative to max_task_load()) to a busy
+ * bucket index, capped at the last bucket.
+ */
+static inline int busy_to_bucket(u32 normalized_rt)
+{
+	int bucket;
+
+	bucket = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
+	bucket = min(bucket, NUM_BUSY_BUCKETS - 1);
+
+	/*
+	 * The lowest two buckets are combined: the lowest frequency falls
+	 * into the 2nd bucket, so predicting the lowest bucket is not
+	 * useful.
+	 */
+	return bucket ? bucket : 1;
+}
+
+/* Return @load multiplied by @src_freq and divided by @dst_freq */
+static inline u64
+scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
+{
+ return div64_u64(load * (u64)src_freq, (u64)dst_freq);
+}
+
+/*
+ * get_pred_busy - calculate predicted demand for a task on runqueue
+ *
+ * @rq: runqueue of task p
+ * @p: task whose prediction is being updated
+ * @start: starting bucket. returned prediction should not be lower than
+ * this bucket.
+ * @runtime: runtime of the task. returned prediction should not be lower
+ * than this runtime.
+ * Note: @start can be derived from @runtime. It's passed in only to
+ * avoid duplicated calculation in some cases.
+ *
+ * A new predicted busy time is returned for task @p based on @runtime
+ * passed in. The function searches through buckets that represent busy
+ * time equal to or bigger than @runtime and attempts to find the bucket
+ * to use for prediction. Once found, it searches through historical busy
+ * time and returns the latest that falls into the bucket. If no such busy
+ * time exists, it returns the midpoint of that bucket.
+ */
+static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
+ int start, u32 runtime)
+{
+ int i;
+ u8 *buckets = p->ravg.busy_buckets;
+ u32 *hist = p->ravg.sum_history;
+ u32 dmin, dmax;
+ /* NOTE(review): never assigned below, so the traced percentage is 0 */
+ u64 cur_freq_runtime = 0;
+ int first = NUM_BUSY_BUCKETS, final;
+ u32 ret = runtime;
+
+ /* skip prediction for new tasks due to lack of history */
+ if (unlikely(is_new_task(p)))
+ goto out;
+
+ /* find minimal bucket index to pick */
+ for (i = start; i < NUM_BUSY_BUCKETS; i++) {
+ if (buckets[i]) {
+ first = i;
+ break;
+ }
+ }
+ /* if no higher buckets are filled, predict runtime */
+ if (first >= NUM_BUSY_BUCKETS)
+ goto out;
+
+ /* compute the bucket for prediction */
+ final = first;
+
+ /* determine demand range for the predicted bucket */
+ if (final < 2) {
+ /* lowest two buckets are combined */
+ dmin = 0;
+ final = 1;
+ } else {
+ dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
+ }
+ dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
+
+ /*
+ * search through runtime history and return first runtime that falls
+ * into the range of predicted bucket.
+ */
+ for (i = 0; i < sched_ravg_hist_size; i++) {
+ if (hist[i] >= dmin && hist[i] < dmax) {
+ ret = hist[i];
+ break;
+ }
+ }
+ /* no historical runtime within bucket found, use average of the bin */
+ if (ret < dmin)
+ ret = (dmin + dmax) / 2;
+ /*
+ * when updating in middle of a window, runtime could be higher
+ * than all recorded history. Always predict at least runtime.
+ */
+ ret = max(runtime, ret);
+out:
+ trace_sched_update_pred_demand(rq, p, runtime,
+ mult_frac((unsigned int)cur_freq_runtime, 100,
+ sched_ravg_window), ret);
+ return ret;
+}
+
+/*
+ * Return the predicted demand to use for @p right now: the stored
+ * prediction when it already covers the busy time of the current window,
+ * otherwise a fresh prediction from get_pred_busy() for that busy time.
+ */
+static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
+{
+	u32 busy = p->ravg.curr_window;
+
+	if (p->ravg.pred_demand < busy)
+		return get_pred_busy(rq, p, busy_to_bucket(busy), busy);
+
+	return p->ravg.pred_demand;
+}
+
+/*
+ * predictive demand of a task is calculated at the window roll-over.
+ * if the task current window busy time exceeds the predicted
+ * demand, update it here to reflect the task needs.
+ *
+ * p->ravg.pred_demand is only ever raised here, never lowered. Idle and
+ * exiting tasks are skipped. PUT_PREV_TASK and TASK_UPDATE are always
+ * considered; TASK_MIGRATE and PICK_NEXT_TASK only when
+ * SCHED_FREQ_ACCOUNT_WAIT_TIME is set; every other event returns early.
+ */
+void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
+{
+ u32 new, old;
+
+ if (is_idle_task(p) || exiting_task(p))
+ return;
+
+ if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
+ (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
+ (event != TASK_MIGRATE &&
+ event != PICK_NEXT_TASK)))
+ return;
+
+ /*
+ * TASK_UPDATE can be called on sleeping task, when its moved between
+ * related groups. Such a task is only considered when wait time is
+ * accounted.
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
+ return;
+ }
+
+ new = calc_pred_demand(rq, p);
+ old = p->ravg.pred_demand;
+
+ if (old >= new)
+ return;
+
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled)) /* throttled deadline tasks are skipped */
+ p->sched_class->fixup_hmp_sched_stats(rq, p,
+ p->ravg.demand,
+ new); /* demand is passed through unchanged */
+
+ p->ravg.pred_demand = new;
+}
+
+/*
+ * Zero the top_tasks_bitmap_size bytes of @bitmap, then set bit
+ * NUM_LOAD_INDICES. That bit lies one past the bits used for load
+ * indices; presumably it acts as a sentinel for bit searches - confirm
+ * against the users of the bitmap.
+ */
+void clear_top_tasks_bitmap(unsigned long *bitmap)
+{
+	unsigned long *bits = bitmap;
+
+	memset(bits, 0, top_tasks_bitmap_size);
+	__set_bit(NUM_LOAD_INDICES, bits);
+}
+
+/*
+ * Convert rq->prev_top, the highest load index recorded for the previous
+ * window, back into a load value.
+ *
+ * Index 0 is a fast path that consults the previous window's bitmap to
+ * tell "no task" (0) from "a task in the lowest slot" (one granule). The
+ * last index is special-cased to a full window.
+ *
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling this function or
+ * load_to_index().
+ */
+static u32 top_task_load(struct rq *rq)
+{
+	int top = rq->prev_top;
+	u8 prev_table = 1 - rq->curr_table;
+
+	if (!top) {
+		/* bit for index 0 is the most significant index bit */
+		int slot0_bit = NUM_LOAD_INDICES - 1;
+
+		if (test_bit(slot0_bit, rq->top_tasks_bitmap[prev_table]))
+			return sched_load_granule;
+
+		return 0;
+	}
+
+	if (top == NUM_LOAD_INDICES - 1)
+		return sched_ravg_window;
+
+	return (top + 1) * sched_load_granule;
+}
+
+/*
+ * Map @load to its slot in the top-tasks tables: one slot per
+ * sched_load_granule of load, clamped to the last slot
+ * (NUM_LOAD_INDICES - 1).
+ */
+static u32 load_to_index(u32 load)
+{
+	const u32 last = (u32)(NUM_LOAD_INDICES - 1);
+	u32 slot = load / sched_load_granule;
+
+	return slot < last ? slot : last;
+}
+
+/*
+ * Bring rq's top-task tables, bitmaps and curr_top/prev_top in line with
+ * the window counters of @p after they have been updated.
+ *
+ * @old_curr_window: p->ravg.curr_window as it was before the update
+ * @new_window: non-zero if the update crossed a window boundary
+ * @full_window: true if at least one whole window elapsed
+ *
+ * Each table counts tasks per load index (see load_to_index()). Bit
+ * (NUM_LOAD_INDICES - index - 1) of the matching bitmap is set when a
+ * count becomes 1 and cleared when a count is found to be 0.
+ */
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+		u32 old_curr_window, int new_window, bool full_window)
+{
+	u8 cur = rq->curr_table;
+	u8 prv = 1 - cur;
+	u8 *cur_cnt = rq->top_tasks[cur];
+	u8 *prv_cnt = rq->top_tasks[prv];
+	u32 cur_busy = p->ravg.curr_window;
+	u32 prv_busy = p->ravg.prev_window;
+	int was, now, closed;
+	bool from_zero;
+
+	/* Nothing moved and no rollover: tables are already correct. */
+	if (!new_window && old_curr_window == cur_busy)
+		return;
+
+	was = load_to_index(old_curr_window);
+	now = load_to_index(cur_busy);
+
+	if (!new_window) {
+		/* Same window: move p from its old slot to its new one. */
+		from_zero = !old_curr_window && cur_busy;
+		if (was != now || from_zero) {
+			if (old_curr_window)
+				cur_cnt[was]--;
+			if (cur_busy)
+				cur_cnt[now]++;
+			if (now > rq->curr_top)
+				rq->curr_top = now;
+		}
+
+		if (cur_cnt[was] == 0)
+			__clear_bit(NUM_LOAD_INDICES - was - 1,
+				    rq->top_tasks_bitmap[cur]);
+
+		if (cur_cnt[now] == 1)
+			__set_bit(NUM_LOAD_INDICES - now - 1,
+				  rq->top_tasks_bitmap[cur]);
+
+		return;
+	}
+
+	/*
+	 * The window has rolled over for this task. By the time we get
+	 * here, the curr/prev swap has already occurred, so the slot in
+	 * the prev table is derived from prev_window.
+	 */
+	closed = load_to_index(prv_busy);
+
+	if (!full_window) {
+		from_zero = !old_curr_window && prv_busy;
+		if (was != closed || from_zero) {
+			if (old_curr_window)
+				prv_cnt[was]--;
+
+			prv_cnt[closed]++;
+
+			if (closed > rq->prev_top)
+				rq->prev_top = closed;
+
+			if (prv_cnt[was] == 0)
+				__clear_bit(NUM_LOAD_INDICES - was - 1,
+					    rq->top_tasks_bitmap[prv]);
+
+			if (prv_cnt[closed] == 1)
+				__set_bit(NUM_LOAD_INDICES - closed - 1,
+					  rq->top_tasks_bitmap[prv]);
+		}
+	} else {
+		/*
+		 * Two cases here. Either 'p' ran for the entire window or
+		 * it didn't run at all. In either case there is no entry
+		 * in the prev table. If 'p' ran the entire window, we just
+		 * need to create a new entry in the prev table. In this
+		 * case the slot corresponds to sched_ravg_window, so the
+		 * top index can be updated unconditionally.
+		 */
+		if (prv_busy) {
+			prv_cnt[closed]++;
+			rq->prev_top = closed;
+		}
+
+		if (prv_cnt[closed] == 1)
+			__set_bit(NUM_LOAD_INDICES - closed - 1,
+				  rq->top_tasks_bitmap[prv]);
+	}
+
+	/* Finally enter p into the new current window, if it has run in it. */
+	if (!cur_busy)
+		return;
+
+	cur_cnt[now]++;
+
+	if (now > rq->curr_top)
+		rq->curr_top = now;
+
+	if (cur_cnt[now] == 1)
+		__set_bit(NUM_LOAD_INDICES - now - 1,
+			  rq->top_tasks_bitmap[cur]);
+}
+
+/* Zero all NUM_LOAD_INDICES per-index task counts in @table. */
+static inline void clear_top_tasks_table(u8 *table)
+{
+	memset(table, 0, sizeof(u8) * NUM_LOAD_INDICES);
+}
+
+/*
+ * Window rollover for the top-task tables of @rq.
+ *
+ * The table that was "prev" is wiped and becomes the new "curr"; the old
+ * "curr" table becomes "prev" and its top index moves to rq->prev_top.
+ * If a full window elapsed (@full_window) the old "curr" data is stale,
+ * so it is wiped as well and prev_top becomes 0.
+ */
+static void rollover_top_tasks(struct rq *rq, bool full_window)
+{
+	u8 closing = rq->curr_table;
+	u8 recycled = 1 - closing;
+
+	clear_top_tasks_table(rq->top_tasks[recycled]);
+	clear_top_tasks_bitmap(rq->top_tasks_bitmap[recycled]);
+
+	if (full_window) {
+		clear_top_tasks_table(rq->top_tasks[closing]);
+		clear_top_tasks_bitmap(rq->top_tasks_bitmap[closing]);
+		rq->prev_top = 0;
+	} else {
+		rq->prev_top = rq->curr_top;
+	}
+
+	rq->curr_table = recycled;
+	rq->curr_top = 0;
+}
+
+/*
+ * Zero-initialised per-CPU contributions; not written anywhere in this
+ * section. Used as the copy source when a task's current window is stale.
+ */
+static u32 empty_windows[NR_CPUS];
+
+/*
+ * Window rollover for the per-task counters of @p: curr_window (and the
+ * per-CPU breakdown) moves to prev_window and curr_window restarts at 0.
+ * When @full_window is set the old current window is not carried over
+ * and prev_window becomes 0 instead.
+ */
+static void rollover_task_window(struct task_struct *p, bool full_window)
+{
+	u32 *src_cpu = full_window ? empty_windows : p->ravg.curr_window_cpu;
+	u32 carried = full_window ? 0 : p->ravg.curr_window;
+	int cpu;
+
+	/* Roll over the sum */
+	p->ravg.prev_window = carried;
+	p->ravg.curr_window = 0;
+
+	/* Roll over individual CPU contributions */
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+		p->ravg.prev_window_cpu[cpu] = src_cpu[cpu];
+		p->ravg.curr_window_cpu[cpu] = 0;
+	}
+}
+
+/*
+ * Window rollover for the busy-time sums of @rq, including the new-task
+ * (nt_) sums and the group sums in rq->grp_time: each curr sum moves to
+ * its prev counterpart and then restarts at 0. When @full_window is set
+ * the prev sums become 0 instead.
+ */
+static void rollover_cpu_window(struct rq *rq, bool full_window)
+{
+	struct group_cpu_time *grp = &rq->grp_time;
+
+	if (unlikely(full_window)) {
+		rq->prev_runnable_sum = 0;
+		rq->nt_prev_runnable_sum = 0;
+		grp->prev_runnable_sum = 0;
+		grp->nt_prev_runnable_sum = 0;
+	} else {
+		rq->prev_runnable_sum = rq->curr_runnable_sum;
+		rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
+		grp->prev_runnable_sum = grp->curr_runnable_sum;
+		grp->nt_prev_runnable_sum = grp->nt_curr_runnable_sum;
+	}
+
+	rq->curr_runnable_sum = 0;
+	rq->nt_curr_runnable_sum = 0;
+	grp->curr_runnable_sum = 0;
+	grp->nt_curr_runnable_sum = 0;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ *
+ * @p: task whose activity since p->ravg.mark_start is being accounted
+ * @rq: runqueue whose counters receive the busy time; rq->grp_time is used
+ * instead when @p has a group and sched_freq_aggregate is set
+ * @event: handed to account_busy_for_cpu_time() to decide whether the
+ * elapsed time counts as busy
+ * @wallclock: end of the interval being accounted
+ * @irqtime: IRQ busy time; used as the delta only when @p is the idle task
+ * and the cpu is not waiting on io
+ *
+ * When mark_start lies before rq->window_start this also rolls over the
+ * per-task windows and, if @p is rq->curr, the per-cpu sums and top-task
+ * tables. All paths except the idle-task IRQ ones (which return directly)
+ * end by refreshing the top-task tables for non-idle, non-exiting tasks.
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, full_window = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = sched_ravg_window;
+ u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ bool new_task;
+ struct related_thread_group *grp;
+ int cpu = rq->cpu;
+ u32 old_curr_window = p->ravg.curr_window; /* for update_top_tasks() */
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ full_window = (window_start - mark_start) >= window_size;
+ if (p->ravg.active_windows < USHRT_MAX) /* saturating count */
+ p->ravg.active_windows++;
+ }
+
+ new_task = is_new_task(p);
+
+ /*
+ * Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks.
+ */
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ if (new_window)
+ rollover_task_window(p, full_window);
+ }
+
+ if (p_is_curr_task && new_window) {
+ rollover_cpu_window(rq, full_window);
+ rollover_top_tasks(rq, full_window);
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+ goto done;
+
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time = &rq->grp_time;
+
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ }
+
+ if (!new_window) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window.
+ */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.curr_window += delta;
+ p->ravg.curr_window_cpu[cpu] += delta;
+ }
+
+ goto done;
+ }
+
+ if (!p_is_curr_task) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task.
+ */
+
+ if (!full_window) {
+ /*
+ * A full window hasn't elapsed, account partial
+ * contribution to previous completed window.
+ */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p)) {
+ p->ravg.prev_window += delta;
+ p->ravg.prev_window_cpu[cpu] += delta;
+ }
+ } else {
+ /*
+ * Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size).
+ */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p)) {
+ p->ravg.prev_window = delta;
+ p->ravg.prev_window_cpu[cpu] = delta;
+ }
+ }
+
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!exiting_task(p)) {
+ p->ravg.curr_window = delta;
+ p->ravg.curr_window_cpu[cpu] = delta;
+ }
+
+ goto done;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun.
+ */
+
+ if (!full_window) {
+ /*
+ * A full window hasn't elapsed, account partial
+ * contribution to previous completed window.
+ */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.prev_window += delta;
+ p->ravg.prev_window_cpu[cpu] += delta;
+ }
+ } else {
+ /*
+ * Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size).
+ */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.prev_window = delta;
+ p->ravg.prev_window_cpu[cpu] = delta;
+ }
+ }
+
+ /*
+ * The sums were already rolled over by rollover_cpu_window()
+ * above (p is the current task and a new window started), so
+ * the busy time is simply added to prev_runnable_sum and
+ * curr_runnable_sum here.
+ */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.curr_window = delta;
+ p->ravg.curr_window_cpu[cpu] = delta;
+ }
+
+ goto done;
+ }
+
+ if (irqtime) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime.
+ */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /*
+ * Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted.
+ */
+ if (mark_start > window_start) {
+ *curr_runnable_sum = scale_exec_time(irqtime, rq); /* assigned, not added */
+ return;
+ }
+
+ /*
+ * The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first.
+ * At most one window's worth is credited to the previous window.
+ */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ *prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq); /* written via rq, not the pointer */
+
+ return;
+ }
+
+done:
+ if (!is_idle_task(p) && !exiting_task(p))
+ update_top_tasks(p, rq, old_curr_window,
+ new_window, full_window);
+}
+
+/*
+ * Predict the demand of @p from @runtime, then record @runtime in the
+ * task's busy buckets. The prediction is made before the bucket for
+ * @runtime is bumped. Returns the prediction.
+ */
+static inline u32 predict_and_update_buckets(struct rq *rq,
+			struct task_struct *p, u32 runtime)
+{
+	int bucket = busy_to_bucket(runtime);
+	u32 predicted = get_pred_busy(rq, p, bucket, runtime);
+
+	bucket_increase(p->ravg.busy_buckets, bucket);
+
+	return predicted;
+}
+
+#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC) /* max age of the cached cluster cycle count */
+
+/*
+ * Assumes rq_lock is held and wallclock was recorded in the same critical
+ * section as this function's invocation.
+ *
+ * Returns the cycle count cached in the cluster of @cpu. The cache is
+ * refreshed through cpu_cycle_counter_cb.get_cpu_cycle_counter() when it
+ * is more than THRESH_CC_UPDATE older than @wallclock. If the cpu has no
+ * cluster the counter is read directly and nothing is cached.
+ */
+static inline u64 read_cycle_counter(int cpu, u64 wallclock)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+ u64 delta;
+
+ if (unlikely(!cluster))
+ return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+ /*
+ * Why don't we need locking here? Let's say that delta is negative
+ * because some other CPU happened to update last_cc_update with a
+ * more recent timestamp. We simply read the counter again in that case
+ * with no harmful side effects. This can happen if there is an FIQ
+ * between when we read the wallclock and when we use it here.
+ *
+ * delta is unsigned, so such a "negative" difference shows up as a
+ * very large value and takes the refresh path below.
+ */
+ delta = wallclock - atomic64_read(&cluster->last_cc_update);
+ if (delta > THRESH_CC_UPDATE) {
+ atomic64_set(&cluster->cycles,
+ cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu));
+ atomic64_set(&cluster->last_cc_update, wallclock);
+ }
+
+ return atomic64_read(&cluster->cycles);
+}
+
+/*
+ * Store the current cycle count of @cpu in p->cpu_cycles.
+ * Does nothing unless use_cycle_counter is set.
+ */
+static void update_task_cpu_cycles(struct task_struct *p, int cpu,
+				   u64 wallclock)
+{
+	if (!use_cycle_counter)
+		return;
+
+	p->cpu_cycles = read_cycle_counter(cpu, wallclock);
+}
+
+/*
+ * Update rq->cc (a cycles/time pair) for the interval that ends at
+ * @wallclock and store the latest counter reading in p->cpu_cycles.
+ *
+ * @p: task being accounted; p->cpu_cycles holds its previous reading
+ * @rq: runqueue whose cc.cycles / cc.time are updated; rq->lock must be held
+ * @event: IRQ_UPDATE on the idle task selects @irqtime as the time delta
+ * @wallclock: current time
+ * @irqtime: IRQ time; non-zero forces an update even if rq->curr is idle
+ *
+ * When use_cycle_counter is clear, rq->cc.cycles is set to cpu_cur_freq()
+ * and rq->cc.time to 1, and p->cpu_cycles is left alone.
+ */
+static void
+update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+			  u64 wallclock, u64 irqtime)
+{
+	u64 cur_cycles;
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_held(&rq->lock);
+
+	if (!use_cycle_counter) {
+		rq->cc.cycles = cpu_cur_freq(cpu);
+		rq->cc.time = 1;
+		return;
+	}
+
+	cur_cycles = read_cycle_counter(cpu, wallclock);
+
+	/*
+	 * If current task is idle task and irqtime == 0 CPU was
+	 * indeed idle and probably its cycle counter was not
+	 * increasing. We still need estimated CPU frequency
+	 * for IO wait time accounting. Use the previously
+	 * calculated frequency in such a case.
+	 */
+	if (!is_idle_task(rq->curr) || irqtime) {
+		/*
+		 * Unsigned subtraction is performed modulo 2^64, so it
+		 * yields the exact elapsed count even when the counter
+		 * wrapped since the previous reading. The former special
+		 * case, cur + (U64_MAX - prev), came out one cycle short.
+		 * Assumes p->cpu_cycles is a u64, as that U64_MAX
+		 * arithmetic already implied.
+		 */
+		rq->cc.cycles = cur_cycles - p->cpu_cycles;
+		rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
+
+		if (event == IRQ_UPDATE && is_idle_task(p))
+			/*
+			 * Time between mark_start of idle task and IRQ handler
+			 * entry time is CPU cycle counter stall period.
+			 * Upon IRQ handler entry sched_account_irqstart()
+			 * replenishes idle task's cpu cycle counter so
+			 * rq->cc.cycles now represents increased cycles during
+			 * IRQ handler rather than time between idle entry and
+			 * IRQ exit. Thus use irqtime as time delta.
+			 */
+			rq->cc.time = irqtime;
+		else
+			rq->cc.time = wallclock - p->ravg.mark_start;
+		BUG_ON((s64)rq->cc.time < 0);
+	}
+
+	p->cpu_cycles = cur_cycles;
+
+	trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles,
+					rq->cc.time, p);
+}
+
+/*
+ * Decide whether the time that just elapsed for @p should be counted as
+ * busy time towards its demand. Returns non-zero if so, 0 otherwise.
+ */
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
+{
+	/*
+	 * No need to bother updating task demand for exiting tasks
+	 * or the idle task.
+	 */
+	if (exiting_task(p) || is_idle_task(p))
+		return 0;
+
+	/* A waking task is completing a segment of non-busy time. */
+	if (event == TASK_WAKE)
+		return 0;
+
+	/*
+	 * If wait time is not treated as busy time, a task that begins to
+	 * run or is migrated was not running and is likewise completing
+	 * a segment of non-busy time.
+	 */
+	if (!SCHED_ACCOUNT_WAIT_TIME &&
+	    (event == PICK_NEXT_TASK || event == TASK_MIGRATE))
+		return 0;
+
+	if (event != TASK_UPDATE)
+		return 1;
+
+	/*
+	 * TASK_UPDATE can be called on sleeping task, when its moved between
+	 * related groups: a running task is busy, a queued one only if wait
+	 * time counts, a sleeping one never.
+	 */
+	if (rq->curr == p)
+		return 1;
+
+	if (!p->on_rq)
+		return 0;
+
+	return SCHED_ACCOUNT_WAIT_TIME;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ *
+ * @runtime is stored @samples times at the front of p->ravg.sum_history
+ * (older entries shift towards the end and fall off), p->ravg.sum is
+ * cleared, and demand is recomputed according to
+ * sched_window_stats_policy: the newest sample (WINDOW_STATS_RECENT), the
+ * largest entry (WINDOW_STATS_MAX), the average (WINDOW_STATS_AVG), or
+ * otherwise the larger of the average and @runtime. pred_demand is
+ * refreshed as well. Windows with no runtime, idle tasks, exiting tasks
+ * and @samples == 0 only emit the tracepoint.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx; /* read and write positions while shifting */
+ u32 max = 0, avg, demand, pred_demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = sched_ravg_hist_size - 1;
+ ridx = widx - samples; /* negative if samples covers the whole history */
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, sched_ravg_hist_size);
+ if (sched_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+ pred_demand = predict_and_update_buckets(rq, p, runtime);
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ */
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
+ pred_demand);
+
+ p->ravg.demand = demand;
+ p->ravg.pred_demand = pred_demand;
+
+done:
+ trace_sched_update_history(rq, p, runtime, samples, event);
+}
+
+/*
+ * Scale @delta with scale_exec_time() and add it to p->ravg.sum, capping
+ * the sum at sched_ravg_window. Returns the scaled delta (uncapped).
+ */
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+{
+	u64 scaled = scale_exec_time(delta, rq);
+
+	p->ravg.sum += scaled;
+
+	/* a task cannot be busier than one whole window */
+	if (unlikely(p->ravg.sum > sched_ravg_window))
+		p->ravg.sum = sched_ravg_window;
+
+	return scaled;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ *    window_start < mark_start < wallclock
+ *
+ *    ws   ms   wc
+ *    |    |    |
+ *    V    V    V
+ *    |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ *    mark_start < window_start < wallclock
+ *
+ *       ms   ws   wc
+ *       |    |    |
+ *       V    V    V
+ *    --------|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ *       ms  ws_tmp                          ws   wc
+ *       |   |                               |    |
+ *       V   V                               V    V
+ *    -------|-------|-------|-------|-------|------
+ *           |                               |
+ *           |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ *
+ * Returns the scaled busy time accounted by this call, or 0 when the
+ * event is not accounted as busy time.
+ */
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start; /* local copy; the field is not written */
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = sched_ravg_window;
+ u64 runtime;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(rq, p, event)) {
+ if (new_window)
+ /*
+ * If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those.
+ */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return 0;
+ }
+
+ if (!new_window) {
+ /*
+ * The simple case - busy time contained within the existing
+ * window.
+ */
+ return add_to_task_demand(rq, p, wallclock - mark_start);
+ }
+
+ /*
+ * Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start.
+ */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ runtime = add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows) {
+ u64 scaled_window = scale_exec_time(window_size, rq);
+
+ update_history(rq, p, scaled_window, nr_full_windows, event);
+ runtime += nr_full_windows * scaled_window;
+ }
+
+ /*
+ * Roll window_start back to current to process any remainder
+ * in current window.
+ */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+ return runtime;
+}
+
+/*
+ * Add @runtime to p->ravg.curr_burst for events where @p was actually
+ * running: PUT_PREV_TASK, or TASK_UPDATE while @p is rq->curr.
+ *
+ * update_task_demand() has checks for idle task and exit task. The
+ * runtime may include the wait time, which is why every other event is
+ * ignored here.
+ */
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime)
+{
+	bool was_running = event == PUT_PREV_TASK ||
+			   (event == TASK_UPDATE && rq->curr == p);
+
+	if (was_running)
+		p->ravg.curr_burst += runtime;
+}
+
+/*
+ * Reflect task activity on its demand and cpu's busy time statistics
+ *
+ * Returns without doing anything if window tracking has not started on
+ * @rq (rq->window_start == 0), if window stats are disabled, or if
+ * @wallclock equals p->ravg.mark_start. rq->lock must be held. On every
+ * other path p->ravg.mark_start is advanced to @wallclock.
+ */
+void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ u64 runtime;
+
+ if (!rq->window_start || sched_disable_window_stats ||
+ p->ravg.mark_start == wallclock)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start) { /* first update: only take a cycle-counter baseline */
+ update_task_cpu_cycles(p, cpu_of(rq), wallclock);
+ goto done;
+ }
+
+ update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
+ runtime = update_task_demand(p, rq, event, wallclock); /* must not touch mark_start */
+ if (runtime)
+ update_task_burst(p, rq, event, runtime);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+ update_task_pred_demand(rq, p, event);
+
+ if (exiting_task(p)) /* no trace event for exiting tasks */
+ goto done;
+
+ trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+ rq->cc.cycles, rq->cc.time,
+ p->grp ? &rq->grp_time : NULL);
+
+done:
+ p->ravg.mark_start = wallclock;
+}
+
+/*
+ * Account IRQ time @delta on @cpu, under rq->lock with IRQs saved.
+ *
+ * If @curr is the idle task the time is also fed to update_task_ravg()
+ * as an IRQ_UPDATE event. The irqload is tracked per jiffy:
+ * rq->cur_irqload accumulates until the jiffy count advances, at which
+ * point rq->avg_irqload is decayed (or cleared after 10 or more jiffies)
+ * and the accumulated value is folded into it.
+ */
+void sched_account_irqtime(int cpu, struct task_struct *curr,
+				 u64 delta, u64 wallclock)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags, elapsed;
+	u64 now_jiffies;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	/*
+	 * cputime (wallclock) uses sched_clock so use the same here for
+	 * consistency.
+	 */
+	delta += sched_clock() - wallclock;
+	now_jiffies = get_jiffies_64();
+
+	if (is_idle_task(curr))
+		update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
+				 delta);
+
+	elapsed = now_jiffies - rq->irqload_ts;
+
+	if (elapsed) {
+		if (elapsed >= 10) {
+			rq->avg_irqload = 0;
+		} else {
+			/*
+			 * Decay CPU's irqload by 3/4.
+			 * NOTE(review): the factor 3n/4n does not depend on
+			 * the number of elapsed jiffies, although the
+			 * original comment said "for each window" - confirm
+			 * the intended decay.
+			 */
+			rq->avg_irqload *= (3 * elapsed);
+			rq->avg_irqload = div64_u64(rq->avg_irqload,
+						    4 * elapsed);
+		}
+		rq->avg_irqload += rq->cur_irqload;
+		rq->cur_irqload = 0;
+	}
+
+	rq->cur_irqload += delta;
+	rq->irqload_ts = now_jiffies;
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/*
+ * If @curr is the idle task, refresh its cycle-counter snapshot under
+ * rq->lock. Does nothing while window tracking has not started on the
+ * cpu or window stats are disabled. @wallclock is not used; the time is
+ * taken from sched_ktime_clock().
+ */
+void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (!rq->window_start || sched_disable_window_stats)
+		return;
+
+	if (!is_idle_task(curr))
+		return;
+
+	/* We're here without rq->lock held, IRQ disabled */
+	raw_spin_lock(&rq->lock);
+	update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * Clear all window statistics of @p (the whole of p->ravg).
+ *
+ * For a task that is not exiting, the per-CPU window arrays are zeroed
+ * and their pointers survive the reset. For an exiting task both
+ * pointers end up NULL and EXITING_TASK_MARKER is kept in
+ * sum_history[0]. avg_burst restarts at twice sysctl_sched_short_burst.
+ */
+void reset_task_stats(struct task_struct *p)
+{
+	/* must be sampled before p->ravg is wiped below */
+	bool exiting = exiting_task(p);
+	u32 *curr_cpu = NULL;
+	u32 *prev_cpu = NULL;
+
+	if (!exiting) {
+		curr_cpu = p->ravg.curr_window_cpu;
+		prev_cpu = p->ravg.prev_window_cpu;
+		memset(curr_cpu, 0, sizeof(u32) * nr_cpu_ids);
+		memset(prev_cpu, 0, sizeof(u32) * nr_cpu_ids);
+	}
+
+	memset(&p->ravg, 0, sizeof(struct ravg));
+
+	p->ravg.curr_window_cpu = curr_cpu;
+	p->ravg.prev_window_cpu = prev_cpu;
+
+	p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
+	/* Retain EXITING_TASK marker */
+	p->ravg.sum_history[0] = exiting ? EXITING_TASK_MARKER : 0;
+}
+
+/*
+ * Stamp @p with the current time: mark_start, last_wake_ts and
+ * last_cpu_selected_ts are set to now, last_switch_out_ts to 0, and the
+ * cycle-counter snapshot is refreshed. If window tracking has not
+ * started on the task's runqueue, or window stats are disabled, the
+ * task's stats are reset instead.
+ */
+void mark_task_starting(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	u64 now;
+
+	if (!rq->window_start || sched_disable_window_stats) {
+		reset_task_stats(p);
+		return;
+	}
+
+	now = sched_ktime_clock();
+
+	p->ravg.mark_start = p->last_wake_ts = now;
+	p->last_cpu_selected_ts = now;
+	p->last_switch_out_ts = 0;
+
+	update_task_cpu_cycles(p, cpu_of(rq), now);
+}
+
+/*
+ * Give @rq its first window_start; a no-op once it has one.
+ *
+ * The first runqueue to get here starts its window at the current
+ * sched_ktime_clock(). Every later one copies window_start from the
+ * runqueue of a cpu picked from cpu_online_mask and clears its own
+ * busy-time sums. In both cases rq->curr's mark_start is set to the
+ * window start.
+ *
+ * NOTE(review): the copy path drops rq->lock before taking both locks
+ * and releases only the other runqueue's lock afterwards, so the caller
+ * presumably holds rq->lock on entry and the picked cpu is presumably
+ * never @rq's own - confirm against the callers.
+ */
+void set_window_start(struct rq *rq)
+{
+	static int sync_cpu_available;
+
+	if (rq->window_start)
+		return;
+
+	if (sync_cpu_available) {
+		struct rq *src = cpu_rq(cpumask_any(cpu_online_mask));
+
+		raw_spin_unlock(&rq->lock);
+		double_rq_lock(rq, src);
+		rq->window_start = src->window_start;
+		rq->prev_runnable_sum = 0;
+		rq->curr_runnable_sum = 0;
+		rq->nt_prev_runnable_sum = 0;
+		rq->nt_curr_runnable_sum = 0;
+		raw_spin_unlock(&src->lock);
+	} else {
+		rq->window_start = sched_ktime_clock();
+		sync_cpu_available = 1;
+	}
+
+	rq->curr->ravg.mark_start = rq->window_start;
+}
+
+/*
+ * Run reset_task_stats() on every thread visited by do_each_thread().
+ * The only caller in this section holds tasklist_lock for reading.
+ */
+static void reset_all_task_stats(void)
+{
+	struct task_struct *leader, *t;
+
+	do_each_thread(leader, t) {
+		reset_task_stats(t);
+	} while_each_thread(leader, t);
+}
+
+/* Reason recorded by reset_all_window_stats() for its tracepoint. */
+enum reset_reason_code {
+	WINDOW_CHANGE,
+	POLICY_CHANGE,
+	HIST_SIZE_CHANGE,
+	FREQ_AGGREGATE_CHANGE,
+};
+
+/*
+ * Printable name of each reset_reason_code, in enum order (presumably
+ * indexed by the reason code where the tracepoint is formatted).
+ */
+const char *sched_window_reset_reasons[] = {
+	[WINDOW_CHANGE]		= "WINDOW_CHANGE",
+	[POLICY_CHANGE]		= "POLICY_CHANGE",
+	[HIST_SIZE_CHANGE]	= "HIST_SIZE_CHANGE",
+	[FREQ_AGGREGATE_CHANGE]	= "FREQ_AGGREGATE_CHANGE",
+};
+
+/*
+ * Called with IRQs enabled
+ *
+ * Resets the window statistics of every task and the busy-time sums,
+ * group sums, load subtractions and top-task tables of every possible
+ * cpu. The work is done with IRQs off while holding tasklist_lock (read),
+ * related_thread_group_lock (read) and every possible cpu's rq->lock.
+ *
+ * @window_start: if non-zero, becomes rq->window_start of every cpu
+ * @window_size: if non-zero, new window length in ticks; it is multiplied
+ * by TICK_NSEC to obtain sched_ravg_window
+ *
+ * Afterwards at most one pending sysctl change (stats policy, history
+ * size, frequency aggregation - checked in that order) is latched into
+ * its sched_* counterpart and reported as the reason in the tracepoint;
+ * otherwise the reason stays WINDOW_CHANGE.
+ */
+void reset_all_window_stats(u64 window_start, unsigned int window_size)
+{
+ int cpu, i;
+ unsigned long flags;
+ u64 start_ts = sched_ktime_clock(); /* to report how long the reset took */
+ int reason = WINDOW_CHANGE;
+ unsigned int old = 0, new = 0; /* old/new value of the changed tunable */
+
+ local_irq_save(flags);
+
+ read_lock(&tasklist_lock);
+
+ read_lock(&related_thread_group_lock);
+
+ /* Taking all runqueue locks prevents race with sched_exit(). */
+ for_each_possible_cpu(cpu)
+ raw_spin_lock(&cpu_rq(cpu)->lock);
+
+ sched_disable_window_stats = 1;
+
+ reset_all_task_stats();
+
+ read_unlock(&tasklist_lock);
+
+ if (window_size) {
+ sched_ravg_window = window_size * TICK_NSEC;
+ set_hmp_defaults();
+ sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
+ }
+
+ sched_disable_window_stats = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (window_start)
+ rq->window_start = window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+ memset(&rq->load_subs[i], 0,
+ sizeof(struct load_subtractions));
+ clear_top_tasks_table(rq->top_tasks[i]);
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
+ }
+
+ rq->curr_table = 0;
+ rq->curr_top = 0;
+ rq->prev_top = 0;
+ reset_cpu_hmp_stats(cpu, 1);
+ }
+
+ if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
+ reason = POLICY_CHANGE;
+ old = sched_window_stats_policy;
+ new = sysctl_sched_window_stats_policy;
+ sched_window_stats_policy = sysctl_sched_window_stats_policy;
+ } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
+ reason = HIST_SIZE_CHANGE;
+ old = sched_ravg_hist_size;
+ new = sysctl_sched_ravg_hist_size;
+ sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
+ } else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
+ }
+
+ for_each_possible_cpu(cpu)
+ raw_spin_unlock(&cpu_rq(cpu)->lock);
+
+ read_unlock(&related_thread_group_lock);
+
+ local_irq_restore(flags);
+
+ trace_sched_reset_all_window_stats(window_start, window_size,
+ sched_ktime_clock() - start_ts, reason, old, new);
+}
+
+/*
+ * Apply the accumulated load subtractions of @rq to its busy-time sums.
+ *
+ * Each entry is matched against the current and the previous window
+ * start. Entries whose window start matches neither are ignored; this
+ * could happen whenever CPUs become idle or busy with interrupts
+ * disabled for an extended period. Every entry's amounts are cleared
+ * afterwards, matched or not. A sum that ends up negative is a BUG.
+ */
+static inline void account_load_subtractions(struct rq *rq)
+{
+	const u64 curr_ws = rq->window_start;
+	const u64 prev_ws = curr_ws - sched_ravg_window;
+	struct load_subtractions *sub = rq->load_subs;
+	struct load_subtractions *end = sub + NUM_TRACKED_WINDOWS;
+
+	for (; sub < end; sub++) {
+		if (sub->window_start == curr_ws) {
+			rq->curr_runnable_sum -= sub->subs;
+			rq->nt_curr_runnable_sum -= sub->new_subs;
+		} else if (sub->window_start == prev_ws) {
+			rq->prev_runnable_sum -= sub->subs;
+			rq->nt_prev_runnable_sum -= sub->new_subs;
+		}
+
+		sub->subs = 0;
+		sub->new_subs = 0;
+	}
+
+	BUG_ON((s64)rq->prev_runnable_sum < 0);
+	BUG_ON((s64)rq->curr_runnable_sum < 0);
+	BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
+	BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
+}
+
+ /*
+ * Pick the load to report for @rq according to
+ * sysctl_sched_freq_reporting_policy:
+ * FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK - the larger of @load and the top
+ * task load,
+ * FREQ_REPORT_TOP_TASK - the top task load only,
+ * FREQ_REPORT_CPU_LOAD and any other value - @load unchanged.
+ */
+ static inline u64 freq_policy_load(struct rq *rq, u64 load)
+ {
+	const unsigned int policy = sysctl_sched_freq_reporting_policy;
+
+	if (policy == FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK)
+		return max_t(u64, load, top_task_load(rq));
+
+	if (policy == FREQ_REPORT_TOP_TASK)
+		return top_task_load(rq);
+
+	return load;
+ }
+
+ /*
+ * sched_get_cpus_busy() - report the previous-window load of a set of CPUs.
+ * @busy: output array. Entry i describes the i-th CPU of @query_cpus in
+ * for_each_cpu() order, so it must provide cpumask_weight(query_cpus)
+ * entries.
+ * @query_cpus: CPUs to report on. Nothing is written when the mask is empty.
+ *
+ * For every queried CPU the window statistics are brought up to date, the
+ * pending load subtractions are applied and the previous-window sums are
+ * sampled while all queried runqueue locks are held. The sampled values are
+ * then scaled and stored in @busy after division by NSEC_PER_USEC.
+ *
+ * A CPU that has an early-detection task (rq->ed_task) reports
+ * sched_ravg_window / NSEC_PER_USEC as prev_load and zero for the other two
+ * fields.
+ */
+ void sched_get_cpus_busy(struct sched_load *busy,
+			 const struct cpumask *query_cpus)
+ {
+	unsigned long flags;
+	struct rq *rq;
+	const int cpus = cpumask_weight(query_cpus);
+	u64 load[cpus], group_load[cpus];
+	u64 nload[cpus], ngload[cpus];
+	u64 pload[cpus];
+	unsigned int max_freq[cpus];
+	int early_detection[cpus];
+	int cpu, i = 0;
+	u64 max_prev_sum = 0;
+	int max_busy_cpu = cpumask_first(query_cpus);
+	u64 total_group_load = 0, total_ngload = 0;
+	bool aggregate_load = false;
+	struct sched_cluster *cluster;
+
+	if (unlikely(cpus == 0))
+		return;
+
+	/*
+	 * Look the cluster up only once the mask is known to be non-empty:
+	 * cpumask_first() of an empty mask is not a valid CPU number and
+	 * must not be used to index per-CPU data.
+	 */
+	cluster = cpu_cluster(cpumask_first(query_cpus));
+
+	local_irq_save(flags);
+
+	/*
+	 * This function could be called in timer context, and the
+	 * current task may have been executing for a long time. Ensure
+	 * that the window stats are current by doing an update.
+	 */
+
+	for_each_cpu(cpu, query_cpus)
+		raw_spin_lock(&cpu_rq(cpu)->lock);
+
+	/*
+	 * We don't really need the cluster lock for this entire for loop
+	 * block. However, there is no advantage in optimizing this as rq
+	 * locks are held regardless and would prevent migration anyways
+	 */
+	raw_spin_lock(&cluster->load_lock);
+
+	for_each_cpu(cpu, query_cpus) {
+		rq = cpu_rq(cpu);
+
+		update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
+				 0);
+
+		/*
+		 * Ensure that we don't report load for 'cpu' again via the
+		 * cpufreq_update_util path in the window that started at
+		 * rq->window_start
+		 */
+		rq->load_reported_window = rq->window_start;
+
+		account_load_subtractions(rq);
+		load[i] = rq->prev_runnable_sum;
+		nload[i] = rq->nt_prev_runnable_sum;
+		pload[i] = rq->hmp_stats.pred_demands_sum;
+		rq->old_estimated_time = pload[i];
+
+		/* Remember the CPU with the largest previous-window sum. */
+		if (load[i] > max_prev_sum) {
+			max_prev_sum = load[i];
+			max_busy_cpu = cpu;
+		}
+
+		/*
+		 * sched_get_cpus_busy() is called for all CPUs in a
+		 * frequency domain. So the notifier_sent flag per
+		 * cluster works even when a frequency domain spans
+		 * more than 1 cluster.
+		 */
+		if (rq->cluster->notifier_sent)
+			rq->cluster->notifier_sent = 0;
+
+		early_detection[i] = (rq->ed_task != NULL);
+		max_freq[i] = cpu_max_freq(cpu);
+		i++;
+	}
+
+	raw_spin_unlock(&cluster->load_lock);
+
+	group_load_in_freq_domain(
+			&cpu_rq(max_busy_cpu)->freq_domain_cpumask,
+			&total_group_load, &total_ngload);
+	aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
+
+	i = 0;
+	for_each_cpu(cpu, query_cpus) {
+		group_load[i] = 0;
+		ngload[i] = 0;
+
+		if (early_detection[i])
+			goto skip_early;
+
+		rq = cpu_rq(cpu);
+		if (aggregate_load) {
+			/* The whole group load is charged to the busiest CPU. */
+			if (cpu == max_busy_cpu) {
+				group_load[i] = total_group_load;
+				ngload[i] = total_ngload;
+			}
+		} else {
+			group_load[i] = rq->grp_time.prev_runnable_sum;
+			ngload[i] = rq->grp_time.nt_prev_runnable_sum;
+		}
+
+		load[i] += group_load[i];
+		nload[i] += ngload[i];
+
+		load[i] = freq_policy_load(rq, load[i]);
+		rq->old_busy_time = load[i];
+
+		/*
+		 * Scale load in reference to cluster max_possible_freq.
+		 *
+		 * Note that scale_load_to_cpu() scales load in reference to
+		 * the cluster max_freq.
+		 */
+		load[i] = scale_load_to_cpu(load[i], cpu);
+		nload[i] = scale_load_to_cpu(nload[i], cpu);
+		pload[i] = scale_load_to_cpu(pload[i], cpu);
+ skip_early:
+		i++;
+	}
+
+	for_each_cpu(cpu, query_cpus)
+		raw_spin_unlock(&(cpu_rq(cpu))->lock);
+
+	local_irq_restore(flags);
+
+	/* The remaining work only touches the local copies and @busy. */
+	i = 0;
+	for_each_cpu(cpu, query_cpus) {
+		rq = cpu_rq(cpu);
+
+		if (early_detection[i]) {
+			busy[i].prev_load = div64_u64(sched_ravg_window,
+						      NSEC_PER_USEC);
+			busy[i].new_task_load = 0;
+			busy[i].predicted_load = 0;
+			goto exit_early;
+		}
+
+		load[i] = scale_load_to_freq(load[i], max_freq[i],
+					     cpu_max_possible_freq(cpu));
+		nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+					      cpu_max_possible_freq(cpu));
+
+		pload[i] = scale_load_to_freq(pload[i], max_freq[i],
+					      rq->cluster->max_possible_freq);
+
+		busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
+		busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
+		busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
+
+ exit_early:
+		trace_sched_get_busy(cpu, busy[i].prev_load,
+				     busy[i].new_task_load,
+				     busy[i].predicted_load,
+				     early_detection[i],
+				     aggregate_load &&
+				     cpu == max_busy_cpu);
+		i++;
+	}
+ }
+
+ /* Store @val in the global sched_io_is_busy setting. */
+ void sched_set_io_is_busy(int val)
+ {
+ sched_io_is_busy = val;
+ }
+
+ /*
+ * sched_set_window() - restart window statistics with a new window.
+ * @window_start: start of the window, in jiffies.
+ * @window_size: length of the window, in ticks.
+ *
+ * The start is translated to nanoseconds and rolled back by whole windows
+ * until it is not in the future, then all window statistics are reset.
+ *
+ * Return: 0 on success, -EINVAL if the window is shorter than
+ * MIN_SCHED_RAVG_WINDOW or if the resulting start would be negative.
+ */
+ int sched_set_window(u64 window_start, unsigned int window_size)
+ {
+	u64 now, cur_jiffies, jiffy_ktime_ns;
+	/* Widen before multiplying so the product cannot wrap in 32 bits. */
+	const u64 window_ns = (u64)window_size * TICK_NSEC;
+	s64 ws;
+	unsigned long flags;
+	int ret = 0;
+
+	if (window_ns < MIN_SCHED_RAVG_WINDOW)
+		return -EINVAL;
+
+	mutex_lock(&policy_mutex);
+
+	/*
+	 * Get a consistent view of ktime, jiffies, and the time
+	 * since the last jiffy (based on last_jiffies_update).
+	 */
+	local_irq_save(flags);
+	cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
+	local_irq_restore(flags);
+
+	/* translate window_start from jiffies to nanoseconds */
+	ws = (window_start - cur_jiffies); /* jiffy difference */
+	ws *= TICK_NSEC;
+	ws += jiffy_ktime_ns;
+
+	/*
+	 * Roll back calculated window start so that it is in
+	 * the past (window stats must have a current window).
+	 *
+	 * The comparison is done as signed: comparing against the unsigned
+	 * 'now' would turn a negative start into a huge value and keep the
+	 * loop running.
+	 */
+	while (ws > (s64)now)
+		ws -= window_ns;
+
+	/* A start before time zero cannot describe a usable window. */
+	if (ws < 0) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	BUG_ON(sched_ktime_clock() < ws);
+
+	reset_all_window_stats(ws, window_size);
+
+	sched_update_freq_max_load(cpu_possible_mask);
+
+ unlock:
+	mutex_unlock(&policy_mutex);
+
+	return ret;
+ }
+
+ /*
+ * Start a fresh subtraction entry in slot @index of @rq for the window
+ * beginning at @ws; both counters start at zero.
+ */
+ static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
+ {
+	struct load_subtractions *entry = &rq->load_subs[index];
+
+	entry->window_start = ws;
+	entry->subs = 0;
+	entry->new_subs = 0;
+ }
+
+ /*
+ * Find the slot of @rq->load_subs that tracks the window starting at @ws.
+ *
+ * If no slot tracks @ws yet, the slot with the smallest window start is
+ * re-initialised for @ws and used instead.
+ *
+ * Return: the slot index. The return type is int rather than bool because
+ * the value is an array index, not a truth value.
+ */
+ static int get_subtraction_index(struct rq *rq, u64 ws)
+ {
+	int i;
+	u64 oldest = ULLONG_MAX;
+	int oldest_index = 0;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		u64 entry_ws = rq->load_subs[i].window_start;
+
+		if (ws == entry_ws)
+			return i;
+
+		/* Track the oldest slot as the one to recycle. */
+		if (entry_ws < oldest) {
+			oldest = entry_ws;
+			oldest_index = i;
+		}
+	}
+
+	create_subtraction_entry(rq, ws, oldest_index);
+	return oldest_index;
+ }
+
+ /*
+ * Add @sub_load to the pending subtraction in slot @index of @rq. When
+ * @new_task is set the same amount is added to the new-task counter too.
+ */
+ static void update_rq_load_subtractions(int index, struct rq *rq,
+					 u32 sub_load, bool new_task)
+ {
+	struct load_subtractions *entry = &rq->load_subs[index];
+
+	entry->subs += sub_load;
+	if (new_task)
+		entry->new_subs += sub_load;
+ }
+
+ /*
+ * Queue subtractions for the load @p left on the other CPUs of @cpu's
+ * cluster.
+ *
+ * For every CPU of the cluster except @cpu, a non-zero per-CPU current or
+ * previous window contribution of @p is added to that CPU's pending
+ * subtractions (for the window starting at @ws, respectively one
+ * sched_ravg_window earlier) and then cleared in @p. The walk runs under the
+ * cluster's load_lock.
+ */
+ static void update_cluster_load_subtractions(struct task_struct *p,
+					int cpu, u64 ws, bool new_task)
+ {
+	struct sched_cluster *cluster = cpu_cluster(cpu);
+	struct cpumask siblings = cluster->cpus;
+	const u64 prev_ws = ws - sched_ravg_window;
+	int sibling;
+
+	/* @cpu itself is not handled here. */
+	cpumask_clear_cpu(cpu, &siblings);
+	raw_spin_lock(&cluster->load_lock);
+
+	for_each_cpu(sibling, &siblings) {
+		struct rq *sibling_rq = cpu_rq(sibling);
+		int slot;
+
+		if (p->ravg.curr_window_cpu[sibling]) {
+			slot = get_subtraction_index(sibling_rq, ws);
+			update_rq_load_subtractions(slot, sibling_rq,
+				p->ravg.curr_window_cpu[sibling], new_task);
+			p->ravg.curr_window_cpu[sibling] = 0;
+		}
+
+		if (p->ravg.prev_window_cpu[sibling]) {
+			slot = get_subtraction_index(sibling_rq, prev_ws);
+			update_rq_load_subtractions(slot, sibling_rq,
+				p->ravg.prev_window_cpu[sibling], new_task);
+			p->ravg.prev_window_cpu[sibling] = 0;
+		}
+	}
+
+	raw_spin_unlock(&cluster->load_lock);
+ }
+
+ /*
+ * Move @p's window contributions from the runqueue of @task_cpu to the
+ * runqueue of @new_cpu. Nothing is done when both CPUs are in the same
+ * frequency domain.
+ *
+ * The destination is credited with the task's full curr/prev window, the
+ * source is debited only by what the task recorded against @task_cpu, and
+ * whatever the task recorded against the other CPUs of the source cluster
+ * is handed to update_cluster_load_subtractions(). When @new_task is set the
+ * new-task (nt_*) sums are adjusted in the same way.
+ */
+ static inline void inter_cluster_migration_fixup
+ (struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
+ {
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ struct rq *src_rq = cpu_rq(task_cpu);
+
+ if (same_freq_domain(new_cpu, task_cpu))
+ return;
+
+ /* From now on the whole window is attributed to the destination CPU. */
+ p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
+ p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
+
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+
+ src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
+ src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
+
+ if (new_task) {
+ dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+ dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+
+ src_rq->nt_curr_runnable_sum -=
+ p->ravg.curr_window_cpu[task_cpu];
+ src_rq->nt_prev_runnable_sum -=
+ p->ravg.prev_window_cpu[task_cpu];
+ }
+
+ p->ravg.curr_window_cpu[task_cpu] = 0;
+ p->ravg.prev_window_cpu[task_cpu] = 0;
+
+ /* Remaining contributions on the source cluster's other CPUs. */
+ update_cluster_load_subtractions(p, task_cpu,
+ src_rq->window_start, new_task);
+
+ /* The source sums must not have been driven below zero. */
+ BUG_ON((s64)src_rq->prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+ BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
+ }
+
+ /*
+ * Return the load index encoded by the first bit set in @bitmap at or after
+ * bit offset @old_top, or 0 when no such bit exists below NUM_LOAD_INDICES.
+ *
+ * The bitmap is stored in reverse: bit b stands for load index
+ * NUM_LOAD_INDICES - 1 - b, so the first set bit is the highest load index.
+ *
+ * NOTE(review): migrate_top_tasks() passes a load index (curr_top/prev_top)
+ * as @old_top, while find_next_bit() interprets it as a bit offset. Confirm
+ * that starting the search there is intended.
+ */
+ static int get_top_index(unsigned long *bitmap, unsigned long old_top)
+ {
+ int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
+
+ if (index == NUM_LOAD_INDICES)
+ return 0;
+
+ return NUM_LOAD_INDICES - 1 - index;
+ }
+
+ /*
+ * Move @p's entries in the top-task tables from @src_rq to @dst_rq.
+ *
+ * This is done once for the current window (tables selected by curr_table)
+ * and once for the previous window (the other table, 1 - curr_table), each
+ * only if the task has a non-zero load in that window. In both cases the
+ * bucket for the task's load index is decremented on the source and
+ * incremented on the destination, the bitmap bit of a bucket is cleared when
+ * it becomes empty and set when it becomes non-empty, and the cached top
+ * index of each runqueue is refreshed.
+ */
+ static void
+ migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+ {
+ int index;
+ int top_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ u8 src = src_rq->curr_table;
+ u8 dst = dst_rq->curr_table;
+ u8 *src_table;
+ u8 *dst_table;
+
+ if (curr_window) {
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(curr_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ /* Bitmap bits are stored in reverse order of the load index. */
+ if (!src_table[index])
+ __clear_bit(NUM_LOAD_INDICES - index - 1,
+ src_rq->top_tasks_bitmap[src]);
+
+ if (dst_table[index] == 1)
+ __set_bit(NUM_LOAD_INDICES - index - 1,
+ dst_rq->top_tasks_bitmap[dst]);
+
+ if (index > dst_rq->curr_top)
+ dst_rq->curr_top = index;
+
+ /* The source lost its top bucket: look up the next one. */
+ top_index = src_rq->curr_top;
+ if (index == top_index && !src_table[index])
+ src_rq->curr_top = get_top_index(
+ src_rq->top_tasks_bitmap[src], top_index);
+ }
+
+ if (prev_window) {
+ /* The previous window lives in the other table. */
+ src = 1 - src;
+ dst = 1 - dst;
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(prev_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (!src_table[index])
+ __clear_bit(NUM_LOAD_INDICES - index - 1,
+ src_rq->top_tasks_bitmap[src]);
+
+ if (dst_table[index] == 1)
+ __set_bit(NUM_LOAD_INDICES - index - 1,
+ dst_rq->top_tasks_bitmap[dst]);
+
+ if (index > dst_rq->prev_top)
+ dst_rq->prev_top = index;
+
+ top_index = src_rq->prev_top;
+ if (index == top_index && !src_table[index])
+ src_rq->prev_top = get_top_index(
+ src_rq->top_tasks_bitmap[src], top_index);
+ }
+ }
+
+ /*
+ * fixup_busy_time() - move @p's window accounting to @new_cpu on migration.
+ * @p: task being migrated; its current runqueue is the source.
+ * @new_cpu: destination CPU.
+ *
+ * Nothing is done for a task that is neither queued nor waking. For an
+ * exiting task only the early-detection state of the source runqueue is
+ * cleared. Otherwise the window statistics of both runqueues and of the task
+ * are brought up to date, the task's busy time is moved (through the group
+ * counters when the task is in a group and frequency aggregation is on,
+ * through inter_cluster_migration_fixup() otherwise), the top-task tables
+ * are updated and the early-detection task, if it is @p, follows the task.
+ *
+ * Both runqueue locks are taken here only when the task is in TASK_WAKING
+ * state.
+ */
+ void fixup_busy_time(struct task_struct *p, int new_cpu)
+ {
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ bool new_task;
+ struct related_thread_group *grp;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ clear_ed_task(p, src_rq);
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ if (sched_disable_window_stats)
+ goto done;
+
+ wallclock = sched_ktime_clock();
+
+ /* Bring source, destination and the task itself up to 'wallclock'. */
+ update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE,
+ wallclock, 0);
+ update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ update_task_ravg(p, task_rq(p), TASK_MIGRATE,
+ wallclock, 0);
+
+ update_task_cpu_cycles(p, new_cpu, wallclock);
+
+ new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+
+ /*
+ * For frequency aggregation, we continue to do migration fixups
+ * even for intra cluster migrations. This is because, the aggregated
+ * load has to be reported on a single CPU regardless.
+ */
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = &src_rq->grp_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ cpu_time = &dest_rq->grp_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (p->ravg.curr_window) {
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -=
+ p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum +=
+ p->ravg.curr_window;
+ }
+ }
+
+ if (p->ravg.prev_window) {
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_prev_runnable_sum -=
+ p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum +=
+ p->ravg.prev_window;
+ }
+ }
+ } else {
+ inter_cluster_migration_fixup(p, new_cpu,
+ task_cpu(p), new_task);
+ }
+
+ migrate_top_tasks(p, src_rq, dest_rq);
+
+ if (!same_freq_domain(new_cpu, task_cpu(p))) {
+ cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
+ cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG);
+ }
+
+ /*
+ * The early-detection marker leaves the source with the task; the
+ * destination takes it over only if it has none of its own.
+ */
+ if (p == src_rq->ed_task) {
+ src_rq->ed_task = NULL;
+ if (!dest_rq->ed_task)
+ dest_rq->ed_task = p;
+ }
+
+ done:
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+ }
+
+ #define sched_up_down_migrate_auto_update 1
+ /*
+ * Recompute up_down_migrate_scale_factor from the first CPU of @cpus and
+ * apply it through update_up_down_migrate().
+ *
+ * Nothing is done when automatic updates are compiled out or when that CPU
+ * already has the maximum possible capacity. The factor is 1024 when the
+ * CPU's maximum frequency equals its maximum possible frequency, and
+ * 1024 * max_possible_freq / max_freq otherwise.
+ */
+ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
+ {
+	const int first = cpumask_first(cpus);
+
+	if (!sched_up_down_migrate_auto_update ||
+	    cpu_max_possible_capacity(first) == max_possible_capacity)
+		return;
+
+	up_down_migrate_scale_factor =
+		cpu_max_possible_freq(first) == cpu_max_freq(first) ? 1024 :
+		(1024 * cpu_max_possible_freq(first)) / cpu_max_freq(first);
+
+	update_up_down_migrate();
+ }
+
+ /*
+ * Return the first cluster, in for_each_sched_cluster() order, for which
+ * group_will_fit() accepts @grp with @total_demand and @group_boost. When no
+ * cluster fits, sched_cluster[0] is returned.
+ */
+ static struct sched_cluster *best_cluster(struct related_thread_group *grp,
+					u64 total_demand, bool group_boost)
+ {
+	struct sched_cluster *candidate = NULL;
+
+	for_each_sched_cluster(candidate) {
+		if (!group_will_fit(candidate, grp, total_demand, group_boost))
+			continue;
+
+		return candidate;
+	}
+
+	return sched_cluster[0];
+ }
+
+ /*
+ * Recompute @grp->preferred_cluster from the demand of the group's tasks.
+ *
+ * Does nothing for an empty group or when less than a tenth of a window has
+ * passed since the last update. set_preferred_cluster() calls this with
+ * grp->lock held.
+ */
+ static void _set_preferred_cluster(struct related_thread_group *grp)
+ {
+ struct task_struct *p;
+ u64 combined_demand = 0;
+ bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
+ bool group_boost = false;
+ u64 wallclock;
+
+ if (list_empty(&grp->tasks))
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ /*
+ * wakeup of two or more related tasks could race with each other and
+ * could result in multiple calls to _set_preferred_cluster being issued
+ * at same time. Avoid overhead in such cases of rechecking preferred
+ * cluster
+ */
+ if (wallclock - grp->last_update < sched_ravg_window / 10)
+ return;
+
+ list_for_each_entry(p, &grp->tasks, grp_list) {
+ /*
+ * A boosted task ends the walk early; combined_demand then only
+ * covers the tasks visited so far.
+ */
+ if (boost_on_big && task_sched_boost(p)) {
+ group_boost = true;
+ break;
+ }
+
+ /* Skip tasks whose mark_start is older than the history span. */
+ if (p->ravg.mark_start < wallclock -
+ (sched_ravg_window * sched_ravg_hist_size))
+ continue;
+
+ combined_demand += p->ravg.demand;
+
+ }
+
+ grp->preferred_cluster = best_cluster(grp,
+ combined_demand, group_boost);
+ grp->last_update = sched_ktime_clock();
+ trace_sched_set_preferred_cluster(grp, combined_demand);
+ }
+
+ /* Recompute the preferred cluster of @grp while holding grp->lock. */
+ void set_preferred_cluster(struct related_thread_group *grp)
+ {
+ raw_spin_lock(&grp->lock);
+ _set_preferred_cluster(grp);
+ raw_spin_unlock(&grp->lock);
+ }
+
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#define DEFAULT_CGROUP_COLOC_ID 1
+
+/*
+ * Task's cpu usage is accounted in:
+ * rq->curr/prev_runnable_sum, when its ->grp is NULL
+ * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer task's cpu usage between those counters when transitioning between
+ * groups
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event)
+{
+ u64 wallclock;
+ struct group_cpu_time *cpu_time;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ int cpu = cpu_of(rq);
+ bool new_task;
+ int i;
+
+ if (!sched_freq_aggregate)
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+ new_task = is_new_task(p);
+
+ cpu_time = &rq->grp_time;
+ if (event == ADD_TASK) {
+ migrate_type = RQ_TO_GROUP;
+
+ src_curr_runnable_sum = &rq->curr_runnable_sum;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &rq->prev_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
+ *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
+ if (new_task) {
+ *src_nt_curr_runnable_sum -=
+ p->ravg.curr_window_cpu[cpu];
+ *src_nt_prev_runnable_sum -=
+ p->ravg.prev_window_cpu[cpu];
+ }
+
+ update_cluster_load_subtractions(p, cpu,
+ rq->window_start, new_task);
+
+ } else {
+ migrate_type = GROUP_TO_RQ;
+
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_curr_runnable_sum = &rq->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ }
+
+ /*
+ * Need to reset curr/prev windows for all CPUs, not just the
+ * ones in the same cluster. Since inter cluster migrations
+ * did not result in the appropriate book keeping, the values
+ * per CPU would be inaccurate.
+ */
+ for_each_possible_cpu(i) {
+ p->ravg.curr_window_cpu[i] = 0;
+ p->ravg.prev_window_cpu[i] = 0;
+ }
+ }
+
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ /*
+ * When a task enter or exits a group, it's curr and prev windows are
+ * moved to a single CPU. This behavior might be sub-optimal in the
+ * exit case, however, it saves us the overhead of handling inter
+ * cluster migration fixups while the task is part of a related group.
+ */
+ p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
+ p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
+
+ trace_sched_migration_update_sum(p, migrate_type, rq);
+
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+}
+
+ /*
+ * Return the entry of related_thread_groups[] for @group_id. The index is
+ * not range-checked here; __sched_set_group_id() checks it against
+ * MAX_NUM_CGROUP_COLOC_ID before calling.
+ */
+ static inline struct related_thread_group*
+ lookup_related_thread_group(unsigned int group_id)
+ {
+ return related_thread_groups[group_id];
+ }
+
+ /*
+ * Allocate and initialise one related thread group for every id from 1 to
+ * MAX_NUM_CGROUP_COLOC_ID - 1 and publish it in related_thread_groups[].
+ *
+ * Return: 0 on success. If an allocation fails, the groups found from id 1
+ * up to the first empty slot are freed, their slots are cleared and -ENOMEM
+ * is returned.
+ */
+ int alloc_related_thread_groups(void)
+ {
+	struct related_thread_group *grp;
+	int id;
+
+	/* group_id = 0 is invalid as it's special id to remove group. */
+	for (id = 1; id < MAX_NUM_CGROUP_COLOC_ID; id++) {
+		grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+		if (!grp)
+			goto free_groups;
+
+		grp->id = id;
+		INIT_LIST_HEAD(&grp->tasks);
+		INIT_LIST_HEAD(&grp->list);
+		raw_spin_lock_init(&grp->lock);
+
+		related_thread_groups[id] = grp;
+	}
+
+	return 0;
+
+ free_groups:
+	/* Undo the allocations, stopping at the first slot never filled. */
+	for (id = 1; id < MAX_NUM_CGROUP_COLOC_ID; id++) {
+		grp = lookup_related_thread_group(id);
+		if (!grp)
+			break;
+
+		kfree(grp);
+		related_thread_groups[id] = NULL;
+	}
+
+	return -ENOMEM;
+ }
+
+ /*
+ * Detach @p from the related thread group it belongs to (p->grp).
+ *
+ * Under grp->lock and the task's runqueue lock the task's busy time is moved
+ * back to the runqueue counters (REM_TASK), the task is unlinked from
+ * grp->tasks and p->grp is cleared. If tasks remain in the group its
+ * preferred cluster is recomputed; if the group became empty and is not the
+ * default colocation group, the group is unlinked from its list.
+ *
+ * The only caller in view, __sched_set_group_id(), holds
+ * related_thread_group_lock for writing.
+ */
+ static void remove_task_from_group(struct task_struct *p)
+ {
+ struct related_thread_group *grp = p->grp;
+ struct rq *rq;
+ int empty_group = 1;
+
+ raw_spin_lock(&grp->lock);
+
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
+ list_del_init(&p->grp_list);
+ rcu_assign_pointer(p->grp, NULL);
+ __task_rq_unlock(rq);
+
+ if (!list_empty(&grp->tasks)) {
+ empty_group = 0;
+ _set_preferred_cluster(grp);
+ }
+
+ raw_spin_unlock(&grp->lock);
+
+ /* Reserved groups cannot be destroyed */
+ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+ /*
+ * We test whether grp->list is attached with list_empty()
+ * hence re-init the list after deletion.
+ */
+ list_del_init(&grp->list);
+ }
+
+ /*
+ * Attach @p to @grp.
+ *
+ * Under grp->lock and the task's runqueue lock the task's busy time is moved
+ * to the group counters (ADD_TASK), the task is linked into grp->tasks and
+ * p->grp is set. The group's preferred cluster is then recomputed.
+ *
+ * Return: always 0.
+ */
+ static int
+ add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
+ {
+ struct rq *rq;
+
+ raw_spin_lock(&grp->lock);
+
+ /*
+ * Change p->grp under rq->lock. Will prevent races with read-side
+ * reference of p->grp in various hot-paths
+ */
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
+ list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
+ __task_rq_unlock(rq);
+
+ _set_preferred_cluster(grp);
+
+ raw_spin_unlock(&grp->lock);
+
+ return 0;
+ }
+
+ /*
+ * Put a new thread into the related thread group of its thread group leader.
+ *
+ * Nothing is done when:
+ * - thread grouping is disabled and the leader is not in the default
+ * colocation group,
+ * - @new is itself a thread group leader,
+ * - the leader is in the default colocation group and same_schedtune()
+ * reports a mismatch between @new and the leader,
+ * - the leader has no group, or @new already has one.
+ *
+ * Unlike add_task_to_group(), no busy time is transferred here.
+ */
+ void add_new_task_to_grp(struct task_struct *new)
+ {
+ unsigned long flags;
+ struct related_thread_group *grp;
+ struct task_struct *leader = new->group_leader;
+ unsigned int leader_grp_id = sched_get_group_id(leader);
+
+ if (!sysctl_sched_enable_thread_grouping &&
+ leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
+ return;
+
+ if (thread_group_leader(new))
+ return;
+
+ if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+ if (!same_schedtune(new, leader))
+ return;
+ }
+
+ write_lock_irqsave(&related_thread_group_lock, flags);
+
+ rcu_read_lock();
+ grp = task_related_thread_group(leader);
+ rcu_read_unlock();
+
+ /*
+ * It's possible that someone already added the new task to the
+ * group. A leader's thread group is updated prior to calling
+ * this function. It's also possible that the leader has exited
+ * the group. In either case, there is nothing else to do.
+ */
+ if (!grp || new->grp) {
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+ return;
+ }
+
+ raw_spin_lock(&grp->lock);
+
+ rcu_assign_pointer(new->grp, grp);
+ list_add(&new->grp_list, &grp->tasks);
+
+ raw_spin_unlock(&grp->lock);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+ }
+
+ /*
+ * Move @p into the related thread group @group_id, or out of its current
+ * group when @group_id is 0.
+ *
+ * The request is silently ignored (0 is returned) when @p is exiting and is
+ * not the current task, when @p has no group and @group_id is 0, or when @p
+ * already has a group and @group_id is non-zero.
+ *
+ * Runs with p->pi_lock held and related_thread_group_lock write-locked.
+ *
+ * Return: -EINVAL if @group_id is not below MAX_NUM_CGROUP_COLOC_ID,
+ * otherwise 0.
+ *
+ * NOTE(review): the result of lookup_related_thread_group() is dereferenced
+ * without a NULL check, while alloc_related_thread_groups() clears the
+ * table entries when an allocation fails. Confirm this path cannot be
+ * reached in that case.
+ */
+ static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
+ {
+ int rc = 0;
+ unsigned long flags;
+ struct related_thread_group *grp = NULL;
+
+ if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ write_lock(&related_thread_group_lock);
+
+ /* Switching from one group to another directly is not permitted */
+ if ((current != p && p->flags & PF_EXITING) ||
+ (!p->grp && !group_id) ||
+ (p->grp && group_id))
+ goto done;
+
+ if (!group_id) {
+ remove_task_from_group(p);
+ goto done;
+ }
+
+ grp = lookup_related_thread_group(group_id);
+ /* An unlinked group is put on the active list before it gets a task. */
+ if (list_empty(&grp->list))
+ list_add(&grp->list, &active_related_thread_groups);
+
+ rc = add_task_to_group(p, grp);
+ done:
+ write_unlock(&related_thread_group_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return rc;
+ }
+
+ /*
+ * Set the related thread group of @p to @group_id through
+ * __sched_set_group_id(). The reserved id DEFAULT_CGROUP_COLOC_ID is
+ * rejected with -EINVAL.
+ */
+ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+ {
+ /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ return __sched_set_group_id(p, group_id);
+ }
+
+ /*
+ * Return the id of the related thread group @p belongs to, or 0 when it has
+ * none. The group is looked up inside an RCU read-side critical section.
+ */
+ unsigned int sched_get_group_id(struct task_struct *p)
+ {
+	struct related_thread_group *grp;
+	unsigned int id = 0;
+
+	rcu_read_lock();
+	grp = task_related_thread_group(p);
+	if (grp)
+		id = grp->id;
+	rcu_read_unlock();
+
+	return id;
+ }
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore this colocation group cannot
+ * be destroyted once it has been created. All of this has been as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+ struct related_thread_group *grp = NULL;
+ unsigned long flags;
+
+ grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+ write_lock_irqsave(&related_thread_group_lock, flags);
+ list_add(&grp->list, &active_related_thread_groups);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+ update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+ return 0;
+}
+late_initcall(create_default_coloc_group);
+
+ /*
+ * Put @p into the default colocation group when @insert is true, or take it
+ * out of its group when @insert is false. Returns what
+ * __sched_set_group_id() returns.
+ */
+ int sync_cgroup_colocation(struct task_struct *p, bool insert)
+ {
+	if (insert)
+		return __sched_set_group_id(p, DEFAULT_CGROUP_COLOC_ID);
+
+	return __sched_set_group_id(p, 0);
+ }
+#endif
+
+ /*
+ * Recompute capacity and load_scale_factor of every cluster that has a CPU
+ * in @cpus, then refresh the global min/max capacity.
+ *
+ * The work is bracketed by pre/post_big_task_count_change() on all possible
+ * CPUs.
+ */
+ static void update_cpu_cluster_capacity(const cpumask_t *cpus)
+ {
+ int i;
+ struct sched_cluster *cluster;
+ struct cpumask cpumask;
+
+ cpumask_copy(&cpumask, cpus);
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_cpu(i, &cpumask) {
+ cluster = cpu_rq(i)->cluster;
+ /* Drop the cluster's CPUs so each cluster is handled once. */
+ cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
+
+ cluster->capacity = compute_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+ /* 'cpus' can contain cpumask more than one cluster */
+ check_for_up_down_migrate_update(&cluster->cpus);
+ }
+
+ __update_min_max_capacity();
+
+ post_big_task_count_change(cpu_possible_mask);
+ }
+
+ static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
+ /*
+ * Record @fmax as max_mitigated_freq of every cluster that has a CPU in
+ * @cpus and, if that changed the value of at least one cluster, recompute
+ * the cluster capacities for @cpus.
+ *
+ * The per-cluster update runs under cpu_freq_min_max_lock; the capacity
+ * update runs after the lock has been dropped. @fmin is not used.
+ */
+ void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
+ {
+ struct cpumask cpumask;
+ struct sched_cluster *cluster;
+ int i, update_capacity = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
+ cpumask_copy(&cpumask, cpus);
+ for_each_cpu(i, &cpumask) {
+ cluster = cpu_rq(i)->cluster;
+ /* Drop the cluster's CPUs so each cluster is handled once. */
+ cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
+
+ update_capacity += (cluster->max_mitigated_freq != fmax);
+ cluster->max_mitigated_freq = fmax;
+ }
+ spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
+
+ if (update_capacity)
+ update_cpu_cluster_capacity(cpus);
+ }
+
+ /*
+ * cpufreq policy notifier.
+ *
+ * CPUFREQ_REMOVE_POLICY and CPUFREQ_CREATE_POLICY only refresh the global
+ * min/max capacity. CPUFREQ_NOTIFY updates the global frequency limits and
+ * the min/max/cur frequency of every cluster covered by the policy. A
+ * cluster seen for the first time (freq_init_done not set) additionally gets
+ * its frequency domain mask, max_possible_freq and max_possible_capacity
+ * initialised under cluster_lock. For already initialised clusters a change
+ * of max_freq triggers a capacity update. Any other event is ignored.
+ *
+ * Return: always 0.
+ */
+ static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+ {
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ struct sched_cluster *cluster = NULL;
+ struct cpumask policy_cluster = *policy->related_cpus;
+ unsigned int orig_max_freq = 0;
+ int i, j, update_capacity = 0;
+
+ if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+ val != CPUFREQ_CREATE_POLICY)
+ return 0;
+
+ if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+ update_min_max_capacity();
+ return 0;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ /* NOTE(review): 1 looks like the "not yet set" value of min_max_freq. */
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ for_each_cpu(i, &policy_cluster) {
+ cluster = cpu_rq(i)->cluster;
+ /* Drop the cluster's CPUs so each cluster is handled once. */
+ cpumask_andnot(&policy_cluster, &policy_cluster,
+ &cluster->cpus);
+
+ orig_max_freq = cluster->max_freq;
+ cluster->min_freq = policy->min;
+ cluster->max_freq = policy->max;
+ cluster->cur_freq = policy->cur;
+
+ if (!cluster->freq_init_done) {
+ mutex_lock(&cluster_lock);
+ for_each_cpu(j, &cluster->cpus)
+ cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
+ policy->related_cpus);
+ cluster->max_possible_freq = policy->cpuinfo.max_freq;
+ cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->freq_init_done = true;
+
+ sort_clusters();
+ update_all_clusters_stats();
+ mutex_unlock(&cluster_lock);
+ continue;
+ }
+
+ update_capacity += (orig_max_freq != cluster->max_freq);
+ }
+
+ if (update_capacity)
+ update_cpu_cluster_capacity(policy->related_cpus);
+
+ return 0;
+ }
+
+ /*
+ * cpufreq transition notifier.
+ *
+ * On CPUFREQ_POSTCHANGE with a frequency that differs from the CPU's current
+ * one, the window statistics of every CPU in every cluster of the CPU's
+ * frequency domain are brought up to date before the cluster's cur_freq is
+ * set to the new frequency. Other events are ignored.
+ *
+ * Return: always 0.
+ */
+ static int cpufreq_notifier_trans(struct notifier_block *nb,
+ unsigned long val, void *data)
+ {
+ struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+ unsigned int cpu = freq->cpu, new_freq = freq->new;
+ unsigned long flags;
+ struct sched_cluster *cluster;
+ struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
+ int i, j;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ BUG_ON(!new_freq);
+
+ if (cpu_cur_freq(cpu) == new_freq)
+ return 0;
+
+ for_each_cpu(i, &policy_cpus) {
+ cluster = cpu_rq(i)->cluster;
+
+ for_each_cpu(j, &cluster->cpus) {
+ struct rq *rq = cpu_rq(j);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ sched_ktime_clock(), 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ cluster->cur_freq = new_freq;
+ /* Drop the cluster's CPUs so each cluster is handled once. */
+ cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
+ }
+
+ return 0;
+ }
+
+ /*
+ * Notifier callback for @cpu: refresh the frequency/max-load data of that
+ * single CPU and re-sort the clusters under cluster_lock. @nb and @data are
+ * not used.
+ *
+ * Return: always 0.
+ */
+ static int pwr_stats_ready_notifier(struct notifier_block *nb,
+ unsigned long cpu, void *data)
+ {
+ cpumask_t mask = CPU_MASK_NONE;
+
+ cpumask_set_cpu(cpu, &mask);
+ sched_update_freq_max_load(&mask);
+
+ mutex_lock(&cluster_lock);
+ sort_clusters();
+ mutex_unlock(&cluster_lock);
+
+ return 0;
+ }
+
+ /* Notifier block that dispatches to cpufreq_notifier_policy(). */
+ static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+ };
+
+ /* Notifier block that dispatches to cpufreq_notifier_trans(). */
+ static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+ };
+
+ /* Notifier block that dispatches to pwr_stats_ready_notifier(). */
+ static struct notifier_block notifier_pwr_stats_ready = {
+ .notifier_call = pwr_stats_ready_notifier
+ };
+
+ /*
+ * Weak default used when no other definition of this function is linked in:
+ * nothing is registered and -EINVAL is returned.
+ */
+ int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
+ {
+ return -EINVAL;
+ }
+
+ /*
+ * Register the scheduler's cpufreq policy and transition notifiers and the
+ * power-stats-ready notifier.
+ *
+ * The transition notifier is registered only if the policy notifier was
+ * registered successfully. The power-stats notifier is registered on a
+ * best-effort basis: its result is ignored because the weak default
+ * implementation always fails.
+ *
+ * Return: 0 on success, otherwise the error reported by the failing cpufreq
+ * notifier registration.
+ */
+ static int register_sched_callback(void)
+ {
+	int ret;
+
+	ret = cpufreq_register_notifier(&notifier_policy_block,
+					CPUFREQ_POLICY_NOTIFIER);
+
+	if (!ret)
+		ret = cpufreq_register_notifier(&notifier_trans_block,
+						CPUFREQ_TRANSITION_NOTIFIER);
+
+	register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
+
+	/* Report a cpufreq registration failure instead of hiding it. */
+	return ret;
+ }
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+ /*
+ * Tell whether the preferred cluster of @grp should be re-evaluated.
+ *
+ * Return: 0 when @grp is NULL. Otherwise 1 if the load of @p differs from
+ * @old_load by more than a quarter of a window, or if more than a full
+ * window has passed since the group's last update; 0 if neither holds.
+ */
+ int update_preferred_cluster(struct related_thread_group *grp,
+		struct task_struct *p, u32 old_load)
+ {
+	const u32 new_load = task_load(p);
+
+	if (!grp)
+		return 0;
+
+	/* A significant change of the task's load is enough on its own. */
+	if (abs(new_load - old_load) > sched_ravg_window / 4)
+		return 1;
+
+	/* Otherwise update once a complete window has gone by. */
+	return sched_ktime_clock() - grp->last_update > sched_ravg_window;
+ }
+
+ /*
+ * Look for an early-detection task on @rq.
+ *
+ * Nothing is done, and false is returned, when the boost policy is
+ * SCHED_BOOST_NONE or the runqueue has no CFS tasks running. Otherwise
+ * rq->ed_task is cleared and at most the first ten entries of rq->cfs_tasks
+ * are examined; the first one whose last wakeup lies at least
+ * EARLY_DETECTION_DURATION before @wallclock becomes rq->ed_task.
+ *
+ * Return: true if such a task was found, false otherwise.
+ */
+ bool early_detection_notify(struct rq *rq, u64 wallclock)
+ {
+	struct task_struct *p;
+	int budget = 10;
+
+	if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
+		return false;
+
+	rq->ed_task = NULL;
+	list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
+		/* Give up once the examination budget is used up. */
+		if (budget-- == 0)
+			break;
+
+		if (wallclock - p->last_wake_ts < EARLY_DETECTION_DURATION)
+			continue;
+
+		rq->ed_task = p;
+		return true;
+	}
+
+	return false;
+ }
+
+ /*
+ * Feed the task's current burst into its avg_burst through update_avg() and
+ * start a new burst from zero.
+ */
+ void update_avg_burst(struct task_struct *p)
+ {
+ update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
+ p->ravg.curr_burst = 0;
+ }
+
+ /*
+ * Account a wakeup of @p at time @wallclock: the time since the task was
+ * last switched out, capped at twice sysctl_sched_short_sleep, is fed into
+ * the task's avg_sleep_time, and @wallclock is recorded as the task's last
+ * wakeup time.
+ */
+ void note_task_waking(struct task_struct *p, u64 wallclock)
+ {
+	u64 sleep_time = wallclock - p->last_switch_out_ts;
+
+	/*
+	 * When a short burst and short sleeping task goes for a long
+	 * sleep, the task's avg_sleep_time gets boosted. It will not
+	 * come below short_sleep threshold for a lot of time and it
+	 * results in incorrect packing. The idea behind tracking
+	 * avg_sleep_time is to detect if a task is short sleeping
+	 * or not. So limit the sleep time to twice the short sleep
+	 * threshold. For regular long sleeping tasks, the avg_sleep_time
+	 * would be higher than threshold, and packing happens correctly.
+	 *
+	 * The doubling is done in 64 bits so that the cap itself cannot
+	 * wrap around for a large threshold.
+	 */
+	sleep_time = min_t(u64, sleep_time, 2ULL * sysctl_sched_short_sleep);
+	update_avg(&p->ravg.avg_sleep_time, sleep_time);
+
+	p->last_wake_ts = wallclock;
+ }
+
+#ifdef CONFIG_CGROUP_SCHED
+ /*
+ * Return the upmigrate_discouraged setting of the task group behind @css.
+ * @cft is not used.
+ */
+ u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+ {
+ struct task_group *tg = css_tg(css);
+
+ return tg->upmigrate_discouraged;
+ }
+
+ /*
+ * Set the upmigrate_discouraged setting of the task group behind @css.
+ *
+ * Any non-zero @upmigrate_discourage is stored as 1. If the setting does not
+ * change, nothing else happens; otherwise the change is bracketed by
+ * pre/post_big_task_count_change() on the online CPUs, with CPU hotplug held
+ * off by get_online_cpus(). @cft is not used.
+ *
+ * Return: always 0.
+ */
+ int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 upmigrate_discourage)
+ {
+ struct task_group *tg = css_tg(css);
+ int discourage = upmigrate_discourage > 0;
+
+ if (tg->upmigrate_discouraged == discourage)
+ return 0;
+
+ /*
+ * Revisit big-task classification for tasks of this cgroup. It would
+ * have been efficient to walk tasks of just this cgroup in running
+ * state, but we don't have easy means to do that. Walk all tasks in
+ * running state on all cpus instead and re-visit their big task
+ * classification.
+ */
+ get_online_cpus();
+ pre_big_task_count_change(cpu_online_mask);
+
+ tg->upmigrate_discouraged = discourage;
+
+ post_big_task_count_change(cpu_online_mask);
+ put_online_cpus();
+
+ return 0;
+ }
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 33d7003fa1b8..d562efb04775 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -80,6 +80,26 @@ static void update_curr_idle(struct rq *rq)
{
}
+#ifdef CONFIG_SCHED_HMP
+
+/*
+ * HMP statistics hooks for the idle class. All three bodies are empty;
+ * they exist so that idle_sched_class can fill in the inc/dec/fixup
+ * hmp_sched_stats callbacks under CONFIG_SCHED_HMP.
+ */
+static void
+inc_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void
+dec_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void
+fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+}
+
+#endif
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -108,4 +128,9 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_idle,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_idle,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle,
+#endif
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ad0519e8c7f5..391ec29c71c0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,11 +5,12 @@
#include "sched.h"
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <trace/events/sched.h>
#include <linux/hrtimer.h>
-#include "walt.h"
#include "tune.h"
int sched_rr_timeslice = RR_TIMESLICE;
@@ -258,8 +259,12 @@ static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
- /* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ /*
+ * Try to pull RT tasks here if we lower this rq's prio and cpu is not
+ * isolated
+ */
+ return rq->rt.highest_prio.curr > prev->prio &&
+ !cpu_isolated(cpu_of(rq));
}
static inline int rt_overloaded(struct rq *rq)
@@ -430,7 +435,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
- return !list_empty(&rt_se->run_list);
+ return rt_se->on_rq;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -476,8 +481,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q;
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
@@ -493,7 +498,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
- enqueue_rt_entity(rt_se, false);
+ enqueue_rt_entity(rt_se, 0);
if (rt_rq->highest_prio.curr < curr->prio)
resched_curr(rq);
@@ -510,7 +515,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
if (!rt_se)
dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, 0);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1244,6 +1249,41 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SCHED_HMP
+
+/* Add @p's contribution to the HMP statistics kept in @rq. */
+static void inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
+{
+	struct hmp_sched_stats *stats = &rq->hmp_stats;
+
+	inc_cumulative_runnable_avg(stats, p);
+}
+
+/* Remove @p's contribution from the HMP statistics kept in @rq. */
+static void dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
+{
+	struct hmp_sched_stats *stats = &rq->hmp_stats;
+
+	dec_cumulative_runnable_avg(stats, p);
+}
+
+/*
+ * Adjust @rq's HMP statistics for a change in @p's load.
+ *
+ * The deltas applied are the new values minus the values currently
+ * recorded for @p (task_load(p) and p->ravg.pred_demand).
+ */
+static void
+fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p,
+			 u32 new_task_load, u32 new_pred_demand)
+{
+	/* PRED_DEMAND_DELTA picks up new_pred_demand and p from this scope. */
+	fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+				      (s64)new_task_load - task_load(p),
+				      PRED_DEMAND_DELTA);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+/*
+ * !CONFIG_SCHED_HMP variants: both hooks have empty bodies so that
+ * enqueue_task_rt()/dequeue_task_rt() can call them unconditionally.
+ */
+static inline void
+inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
@@ -1280,7 +1320,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
dec_rt_group(rt_se, rt_rq);
}
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Tell whether an enqueue/dequeue with @flags should change the position
+ * of rt_se->run_list.
+ *
+ * The list is left alone only for SAVE without MOVE. The test is written
+ * with the DEQUEUE_* names and assumes the ENQUEUE_* flags match them.
+ */
+static inline bool move_entity(unsigned int flags)
+{
+	const unsigned int save_move = DEQUEUE_SAVE | DEQUEUE_MOVE;
+
+	return (flags & save_move) != DEQUEUE_SAVE;
+}
+
+/*
+ * Unlink @rt_se from its priority queue in @array and mark it as no
+ * longer on a list. The priority's bit in the bitmap is cleared when the
+ * queue for that priority becomes empty.
+ */
+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
+			       struct rt_prio_array *array)
+{
+	list_del_init(&rt_se->run_list);
+	rt_se->on_list = 0;
+
+	/* That was the last entity queued at this priority. */
+	if (list_empty(array->queue + rt_se_prio(rt_se)))
+		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1293,26 +1356,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other
* active members.
*/
- if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+ if (rt_se->on_list)
+ __delist_rt_entity(rt_se, array);
return;
+ }
- if (head)
- list_add(&rt_se->run_list, queue);
- else
- list_add_tail(&rt_se->run_list, queue);
- __set_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(rt_se->on_list);
+ if (flags & ENQUEUE_HEAD)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+ rt_se->on_list = 1;
+ }
+ rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq);
}
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
- list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(!rt_se->on_list);
+ __delist_rt_entity(rt_se, array);
+ }
+ rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1321,7 +1395,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
@@ -1334,31 +1408,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se);
+ __dequeue_rt_entity(rt_se, flags);
}
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, head);
+ __enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt);
}
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, false);
+ __enqueue_rt_entity(rt_se, flags);
}
enqueue_top_rt_rq(&rq->rt);
}
@@ -1374,8 +1448,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- walt_inc_cumulative_runnable_avg(rq, p);
+ enqueue_rt_entity(rt_se, flags);
+ inc_hmp_sched_stats_rt(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1413,8 +1487,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- dequeue_rt_entity(rt_se);
- walt_dec_cumulative_runnable_avg(rq, p);
+ dequeue_rt_entity(rt_se, flags);
+ dec_hmp_sched_stats_rt(rq, p);
dequeue_pushable_task(rq, p);
@@ -1469,6 +1543,40 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+#ifdef CONFIG_SCHED_HMP
+/*
+ * HMP placement for an RT task: use the cpu chosen by find_lowest_rq()
+ * when it finds one, otherwise keep the caller-supplied @cpu.
+ * @sd_flag and @flags are not consulted.
+ */
+static int
+select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+	int lowest;
+
+	rcu_read_lock();
+	lowest = find_lowest_rq(p);
+	rcu_read_unlock();
+
+	return lowest != -1 ? lowest : cpu;
+}
+#endif
+
+/*
+ * Tell whether @task on @cpu is, or is likely to become, non-preemptible
+ * because of long-running softirq work.
+ *
+ * This is the case when a softirq in LONG_SOFTIRQ_MASK is active or pending
+ * on @cpu and @task is either that cpu's ksoftirqd thread (which will be
+ * handling the slow softirqs) or is currently inside softirq context
+ * according to its preempt_count.
+ */
+bool task_may_not_preempt(struct task_struct *task, int cpu)
+{
+	__u32 active = per_cpu(active_softirqs, cpu);
+	__u32 pending = __IRQ_STAT(cpu, __softirq_pending);
+
+	/* No long softirq active or pending on this cpu. */
+	if (!((active | pending) & LONG_SOFTIRQ_MASK))
+		return false;
+
+	if (task == per_cpu(ksoftirqd, cpu))
+		return true;
+
+	return (task_thread_info(task)->preempt_count & SOFTIRQ_MASK) != 0;
+}
+
/*
* Perform a schedtune dequeue and cancelation of boost timers if needed.
* Should be called only with the rq->lock held.
@@ -1501,6 +1609,11 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
{
struct task_struct *curr;
struct rq *rq;
+ bool may_not_preempt;
+
+#ifdef CONFIG_SCHED_HMP
+ return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
+#endif
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1512,7 +1625,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
curr = READ_ONCE(rq->curr); /* unlocked access */
/*
- * If the current task on @p's runqueue is an RT task, then
+ * If the current task on @p's runqueue is a softirq task,
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -1533,17 +1656,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/
- if (curr && unlikely(rt_task(curr)) &&
+ may_not_preempt = task_may_not_preempt(curr, cpu);
+ if (may_not_preempt ||
+ (unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ curr->prio <= p->prio))) {
int target = find_lowest_rq(p);
/*
- * Don't bother moving it if the destination CPU is
- * not running a lower priority task.
+ * If cpu is non-preemptible, prefer remote cpu
+ * even if it's running a higher-prio task.
+ * Otherwise: Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
*/
if (target != -1 &&
- p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ (may_not_preempt ||
+ p->prio < cpu_rq(target)->rt.highest_prio.curr))
cpu = target;
}
rcu_read_unlock();
@@ -1752,6 +1880,109 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+#ifdef CONFIG_SCHED_HMP
+
+/*
+ * HMP variant of find_lowest_rq(): choose a cpu for RT @task.
+ *
+ * Candidates come from cpupri_find() and are examined cluster by cluster,
+ * excluding isolated cpus and cpus with high irq load. Within the
+ * candidates the cpu with the smallest cumulative_runnable_avg wins; for
+ * short-burst tasks a lower wakeup_latency takes precedence over load.
+ *
+ * Returns the selected cpu, or -1 when no suitable cpu was found.
+ */
+static int find_lowest_rq_hmp(struct task_struct *task)
+{
+ struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask);
+ struct cpumask candidate_mask = CPU_MASK_NONE;
+ struct sched_cluster *cluster;
+ int best_cpu = -1;
+ int prev_cpu = task_cpu(task);
+ u64 cpu_load, min_load = ULLONG_MAX;
+ int i;
+ int restrict_cluster;
+ int boost_on_big;
+ int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
+
+ /*
+ * With full-throttle boost and the on-big policy, the first pass only
+ * looks at clusters whose capacity equals max_possible_capacity.
+ */
+ boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
+ sched_boost_policy() == SCHED_BOOST_ON_BIG;
+
+ restrict_cluster = sysctl_sched_restrict_cluster_spill;
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return best_cpu;
+
+ if (task->nr_cpus_allowed == 1)
+ return best_cpu; /* No other targets possible */
+
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return best_cpu; /* No targets found */
+
+ pack_task = is_short_burst_task(task);
+
+ /*
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ */
+
+retry:
+ for_each_sched_cluster(cluster) {
+ if (boost_on_big && cluster->capacity != max_possible_capacity)
+ continue;
+
+ /* Candidates: this cluster's cpus in lowest_mask, minus isolated cpus. */
+ cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
+ cpumask_andnot(&candidate_mask, &candidate_mask,
+ cpu_isolated_mask);
+ /*
+ * When placement boost is active, if there is no eligible CPU
+ * in the highest capacity cluster, we fallback to the other
+ * clusters. So clear the CPUs of the traversed cluster from
+ * the lowest_mask.
+ */
+ if (unlikely(boost_on_big))
+ cpumask_andnot(lowest_mask, lowest_mask,
+ &cluster->cpus);
+
+ if (cpumask_empty(&candidate_mask))
+ continue;
+
+ for_each_cpu(i, &candidate_mask) {
+ if (sched_cpu_high_irqload(i))
+ continue;
+
+ cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg;
+ /* Loads are only scaled per cpu when clusters may be mixed. */
+ if (!restrict_cluster)
+ cpu_load = scale_load_to_cpu(cpu_load, i);
+
+ /*
+ * Short-burst tasks: a strictly lower wakeup latency wins
+ * outright, a higher one is rejected, and an equal one falls
+ * through to the load comparison below.
+ */
+ if (pack_task) {
+ wakeup_latency = cpu_rq(i)->wakeup_latency;
+
+ if (wakeup_latency > least_wakeup_latency)
+ continue;
+
+ if (wakeup_latency < least_wakeup_latency) {
+ least_wakeup_latency = wakeup_latency;
+ min_load = cpu_load;
+ best_cpu = i;
+ continue;
+ }
+ }
+
+ /*
+ * Lower load wins. On equal load prefer prev_cpu itself, or,
+ * unless prev_cpu is already the choice, a cpu that shares
+ * cache with prev_cpu.
+ */
+ if (cpu_load < min_load ||
+ (cpu_load == min_load &&
+ (i == prev_cpu || (best_cpu != prev_cpu &&
+ cpus_share_cache(prev_cpu, i))))) {
+ min_load = cpu_load;
+ best_cpu = i;
+ }
+ }
+
+ /* With cluster-spill restriction, stop at the first cluster that yields a cpu. */
+ if (restrict_cluster && best_cpu != -1)
+ break;
+ }
+
+ /*
+ * The boosted pass found nothing: drop the restriction and search the
+ * cpus still left in lowest_mask.
+ */
+ if (unlikely(boost_on_big && best_cpu == -1)) {
+ boost_on_big = 0;
+ goto retry;
+ }
+
+ return best_cpu;
+}
+#endif /* CONFIG_SCHED_HMP */
+
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
@@ -1759,6 +1990,10 @@ static int find_lowest_rq(struct task_struct *task)
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+#ifdef CONFIG_SCHED_HMP
+ return find_lowest_rq_hmp(task);
+#endif
+
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
return -1;
@@ -1975,11 +2210,13 @@ retry:
goto retry;
}
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, lowest_rq->cpu);
next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(lowest_rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
resched_curr(lowest_rq);
@@ -2249,11 +2486,13 @@ static void pull_rt_task(struct rq *this_rq)
resched = true;
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2326,7 +2565,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
+ cpu_isolated(cpu_of(rq)))
return;
queue_pull_task(rq);
@@ -2341,6 +2581,7 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
+
#endif /* CONFIG_SMP */
/*
@@ -2514,6 +2755,11 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_rt,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_rt,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d066a6870245..90cc450dff7e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,14 +28,13 @@ extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
+
extern long calc_load_fold_active(struct rq *this_rq);
#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
-extern void check_for_migration(struct rq *rq, struct task_struct *p);
#else
static inline void update_cpu_load_active(struct rq *this_rq) { }
-static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
#endif
/*
@@ -245,6 +244,10 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;
+#ifdef CONFIG_SCHED_HMP
+ bool upmigrate_discouraged;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
@@ -350,12 +353,96 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */
+extern struct task_group *css_tg(struct cgroup_subsys_state *css);
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+#ifdef CONFIG_SCHED_HMP
+
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
+
+/*
+ * Aggregate HMP load statistics; embedded in struct rq and struct cfs_rq
+ * as hmp_stats.
+ */
+struct hmp_sched_stats {
+ /* NOTE(review): presumably maintained by inc/dec_nr_big_task() - confirm */
+ int nr_big_tasks;
+ /* Sum of p->ravg.demand, see inc/dec_cumulative_runnable_avg() */
+ u64 cumulative_runnable_avg;
+ /* Sum of p->ravg.pred_demand, see inc/dec_cumulative_runnable_avg() */
+ u64 pred_demands_sum;
+};
+
+struct load_subtractions {
+ u64 window_start;
+ u64 subs;
+ u64 new_subs;
+};
+
+struct group_cpu_time {
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+};
+
+/*
+ * Per-cluster scheduler state: the set of cpus in the cluster together
+ * with the capacity, power-cost and frequency values that the cpu_*()
+ * accessors below read through rq->cluster. Clusters are linked on
+ * cluster_head through @list (see for_each_sched_cluster()).
+ */
+struct sched_cluster {
+ raw_spinlock_t load_lock;
+ struct list_head list;
+ struct cpumask cpus;
+ int id;
+ int max_power_cost;
+ int min_power_cost;
+ int max_possible_capacity;
+ int capacity;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ unsigned int exec_scale_factor;
+ /*
+ * max_freq = user maximum
+ * max_mitigated_freq = thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq;
+ unsigned int max_possible_freq;
+ bool freq_init_done;
+ int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
+ unsigned int static_cluster_pwr_cost;
+ int notifier_sent;
+ bool wake_up_idle;
+ atomic64_t last_cc_update;
+ atomic64_t cycles;
+};
+
+extern unsigned long all_cluster_ids[];
+
+/* Return the first cpu set in @cluster's cpumask. */
+static inline int cluster_first_cpu(struct sched_cluster *cluster)
+{
+	const struct cpumask *mask = &cluster->cpus;
+
+	return cpumask_first(mask);
+}
+
+struct related_thread_group {
+ int id;
+ raw_spinlock_t lock;
+ struct list_head tasks;
+ struct list_head list;
+ struct sched_cluster *preferred_cluster;
+ struct rcu_head rcu;
+ u64 last_update;
+};
+
+extern struct list_head cluster_head;
+extern struct sched_cluster *sched_cluster[NR_CPUS];
+
+struct cpu_cycle {
+ u64 cycles;
+ u64 time;
+};
+
+/* Walk every sched_cluster linked on cluster_head (RCU list iteration). */
+#define for_each_sched_cluster(cluster) \
+ list_for_each_entry_rcu(cluster, &cluster_head, list)
+
+extern unsigned int sched_disable_window_stats;
+#endif /* CONFIG_SCHED_HMP */
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -424,11 +511,12 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
-#ifdef CONFIG_SCHED_WALT
- u64 cumulative_runnable_avg;
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef CONFIG_SCHED_HMP
+ struct hmp_sched_stats hmp_stats;
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
@@ -698,6 +786,38 @@ struct rq {
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_HMP
+ struct sched_cluster *cluster;
+ struct cpumask freq_domain_cpumask;
+ struct hmp_sched_stats hmp_stats;
+
+ int cstate, wakeup_latency, wakeup_energy;
+ u64 window_start;
+ u64 load_reported_window;
+ unsigned long hmp_flags;
+
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+ unsigned int static_cpu_pwr_cost;
+ struct task_struct *ed_task;
+ struct cpu_cycle cc;
+ u64 old_busy_time, old_busy_time_group;
+ u64 old_estimated_time;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ struct group_cpu_time grp_time;
+ struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
+ DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
+ NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
+ u8 *top_tasks[NUM_TRACKED_WINDOWS];
+ u8 curr_table;
+ int prev_top;
+ int curr_top;
+#endif
+
#ifdef CONFIG_SCHED_WALT
u64 cumulative_runnable_avg;
u64 window_start;
@@ -711,7 +831,6 @@ struct rq {
u64 cum_window_demand;
#endif /* CONFIG_SCHED_WALT */
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -986,6 +1105,652 @@ static inline void sched_ttwu_pending(void) { }
#include "stats.h"
#include "auto_group.h"
+/*
+ * Boost placement policy, as returned by sched_boost_policy() and passed
+ * to task_load_will_fit().
+ */
+enum sched_boost_policy {
+ SCHED_BOOST_NONE,
+ SCHED_BOOST_ON_BIG,
+ SCHED_BOOST_ON_ALL,
+};
+
+#ifdef CONFIG_SCHED_HMP
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define SCHED_UPMIGRATE_MIN_NICE 15
+#define EXITING_TASK_MARKER 0xdeaddead
+
+#define UP_MIGRATION 1
+#define DOWN_MIGRATION 2
+#define IRQLOAD_MIGRATION 3
+
+extern struct mutex policy_mutex;
+extern unsigned int sched_ravg_window;
+extern unsigned int sched_disable_window_stats;
+extern unsigned int max_possible_freq;
+extern unsigned int min_max_freq;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern unsigned int max_possible_efficiency;
+extern unsigned int min_possible_efficiency;
+extern unsigned int max_capacity;
+extern unsigned int min_capacity;
+extern unsigned int max_load_scale_factor;
+extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
+extern unsigned int max_power_cost;
+extern unsigned int sched_init_task_load_windows;
+extern unsigned int up_down_migrate_scale_factor;
+extern unsigned int sysctl_sched_restrict_cluster_spill;
+extern unsigned int sched_pred_alert_load;
+extern struct sched_cluster init_cluster;
+extern unsigned int __read_mostly sched_short_sleep_task_threshold;
+extern unsigned int __read_mostly sched_long_cpu_selection_threshold;
+extern unsigned int __read_mostly sched_big_waker_task_load;
+extern unsigned int __read_mostly sched_small_wakee_task_load;
+extern unsigned int __read_mostly sched_spill_load;
+extern unsigned int __read_mostly sched_upmigrate;
+extern unsigned int __read_mostly sched_downmigrate;
+extern unsigned int __read_mostly sched_load_granule;
+
+extern void init_new_task_load(struct task_struct *p);
+extern u64 sched_ktime_clock(void);
+extern int got_boost_kick(void);
+extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
+extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+extern bool early_detection_notify(struct rq *rq, u64 wallclock);
+extern void clear_ed_task(struct task_struct *p, struct rq *rq);
+extern void fixup_busy_time(struct task_struct *p, int new_cpu);
+extern void clear_boost_kick(int cpu);
+extern void clear_hmp_request(int cpu);
+extern void mark_task_starting(struct task_struct *p);
+extern void set_window_start(struct rq *rq);
+extern void update_cluster_topology(void);
+extern void note_task_waking(struct task_struct *p, u64 wallclock);
+extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
+extern void init_clusters(void);
+extern void reset_cpu_hmp_stats(int cpu, int reset_cra);
+extern unsigned int max_task_load(void);
+extern void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock);
+extern void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock);
+extern unsigned int cpu_temp(int cpu);
+extern unsigned int nr_eligible_big_tasks(int cpu);
+extern int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load);
+extern void set_preferred_cluster(struct related_thread_group *grp);
+extern void add_new_task_to_grp(struct task_struct *new);
+extern unsigned int update_freq_aggregate_threshold(unsigned int threshold);
+extern void update_avg_burst(struct task_struct *p);
+extern void update_avg(u64 *avg, u64 sample);
+
+#define NO_BOOST 0
+#define FULL_THROTTLE_BOOST 1
+#define CONSERVATIVE_BOOST 2
+#define RESTRAINED_BOOST 3
+
+/* Return the sched_cluster that @cpu's runqueue points to. */
+static inline struct sched_cluster *cpu_cluster(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->cluster;
+}
+
+/* Return the capacity recorded for @cpu's cluster. */
+static inline int cpu_capacity(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->capacity;
+}
+
+/* Return the max_possible_capacity recorded for @cpu's cluster. */
+static inline int cpu_max_possible_capacity(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->max_possible_capacity;
+}
+
+/* Return the load_scale_factor recorded for @cpu's cluster. */
+static inline int cpu_load_scale_factor(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->load_scale_factor;
+}
+
+/* Return the efficiency value recorded for @cpu's cluster. */
+static inline int cpu_efficiency(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->efficiency;
+}
+
+/* Return the cur_freq recorded for @cpu's cluster. */
+static inline unsigned int cpu_cur_freq(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->cur_freq;
+}
+
+/* Return the min_freq recorded for @cpu's cluster. */
+static inline unsigned int cpu_min_freq(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->min_freq;
+}
+
+/*
+ * Return the current maximum frequency of @cluster.
+ *
+ * The governor and the thermal driver each vote a limit without knowing
+ * about the other party's mitigation, so the cluster stores both
+ * (max_freq and max_mitigated_freq) and the smaller one is reported.
+ */
+static inline unsigned int cluster_max_freq(struct sched_cluster *cluster)
+{
+	unsigned int thermal_max = cluster->max_mitigated_freq;
+	unsigned int user_max = cluster->max_freq;
+
+	return min(thermal_max, user_max);
+}
+
+/* Return cluster_max_freq() of the cluster that @cpu belongs to. */
+static inline unsigned int cpu_max_freq(int cpu)
+{
+	struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster_max_freq(cluster);
+}
+
+/* Return the max_possible_freq recorded for @cpu's cluster. */
+static inline unsigned int cpu_max_possible_freq(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->max_possible_freq;
+}
+
+/* Non-zero when @src_cpu and @dst_cpu point to the same sched_cluster. */
+static inline int same_cluster(int src_cpu, int dst_cpu)
+{
+	const struct sched_cluster *src = cpu_rq(src_cpu)->cluster;
+	const struct sched_cluster *dst = cpu_rq(dst_cpu)->cluster;
+
+	return src == dst;
+}
+
+/* Return the max_power_cost recorded for @cpu's cluster. */
+static inline int cpu_max_power_cost(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->max_power_cost;
+}
+
+/* Return the min_power_cost recorded for @cpu's cluster. */
+static inline int cpu_min_power_cost(int cpu)
+{
+	const struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+	return cluster->min_power_cost;
+}
+
+/*
+ * Return @cycles divided by @period (64-bit division), converted to u32.
+ */
+static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period)
+{
+	u64 freq = div64_u64(cycles, period);
+
+	return freq;
+}
+
+/*
+ * True when max_possible_capacity differs from min_max_possible_capacity.
+ */
+static inline bool hmp_capable(void)
+{
+	const bool uniform = max_possible_capacity == min_max_possible_capacity;
+
+	return !uniform;
+}
+
+/* True when @cpu's max possible capacity equals max_possible_capacity. */
+static inline bool is_max_capacity_cpu(int cpu)
+{
+	int capacity = cpu_max_possible_capacity(cpu);
+
+	return capacity == max_possible_capacity;
+}
+
+/*
+ * True when @cpu's max possible capacity equals min_max_possible_capacity.
+ */
+static inline bool is_min_capacity_cpu(int cpu)
+{
+	int capacity = cpu_max_possible_capacity(cpu);
+
+	return capacity == min_max_possible_capacity;
+}
+
+/*
+ * Rescale @task_load for @cpu.
+ *
+ * The incoming load is expressed relative to the "best cpu" running at its
+ * best frequency. It is multiplied by @cpu's load scale factor over 1024 to
+ * account for how much worse @cpu is than that reference; a factor of
+ * exactly 1024 leaves the load unchanged.
+ */
+static inline u64 scale_load_to_cpu(u64 task_load, int cpu)
+{
+	u64 factor = cpu_load_scale_factor(cpu);
+
+	if (factor == 1024)
+		return task_load;
+
+	return (task_load * factor) / 1024;
+}
+
+/* Return @p's current demand (p->ravg.demand). */
+static inline unsigned int task_load(struct task_struct *p)
+{
+	unsigned int demand = p->ravg.demand;
+
+	return demand;
+}
+
+/*
+ * Account @p in @stats: add p->ravg.demand to cumulative_runnable_avg and
+ * p->ravg.pred_demand to pred_demands_sum.
+ *
+ * Nothing is accounted while sched_disable_window_stats is set.
+ */
+static inline void
+inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+			    struct task_struct *p)
+{
+	u32 task_load;
+
+	if (sched_disable_window_stats)
+		return;
+
+	/*
+	 * The flag was tested above, so the demand is used as is. Testing
+	 * it a second time could only matter if it changed in between, in
+	 * which case the two sums would be updated inconsistently.
+	 */
+	task_load = p->ravg.demand;
+
+	stats->cumulative_runnable_avg += task_load;
+	stats->pred_demands_sum += p->ravg.pred_demand;
+}
+
+/*
+ * Remove @p from @stats: subtract p->ravg.demand from
+ * cumulative_runnable_avg and p->ravg.pred_demand from pred_demands_sum.
+ *
+ * Nothing is done while sched_disable_window_stats is set. Either sum
+ * going negative (when viewed as s64) is treated as a fatal accounting
+ * error.
+ */
+static inline void
+dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+			    struct task_struct *p)
+{
+	u32 task_load;
+
+	if (sched_disable_window_stats)
+		return;
+
+	/*
+	 * The flag was tested above, so the demand is used as is. Testing
+	 * it a second time could only matter if it changed in between, in
+	 * which case the two sums would be updated inconsistently.
+	 */
+	task_load = p->ravg.demand;
+
+	stats->cumulative_runnable_avg -= task_load;
+	BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+	stats->pred_demands_sum -= p->ravg.pred_demand;
+	BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+/*
+ * Apply signed corrections to @stats: @task_load_delta is added to
+ * cumulative_runnable_avg and @pred_demand_delta to pred_demands_sum.
+ *
+ * Nothing is done while sched_disable_window_stats is set. Either sum
+ * going negative (when viewed as s64) is treated as a fatal accounting
+ * error. @p is not used here.
+ */
+static inline void
+fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+			      struct task_struct *p, s64 task_load_delta,
+			      s64 pred_demand_delta)
+{
+	if (!sched_disable_window_stats) {
+		stats->cumulative_runnable_avg += task_load_delta;
+		BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+		stats->pred_demands_sum += pred_demand_delta;
+		BUG_ON((s64)stats->pred_demands_sum < 0);
+	}
+}
+
+/*
+ * Convert @tunable, a percentage of max_task_load(), to the load value it
+ * stands for. The argument is parenthesized so that an expression such as
+ * "a + b" is cast and multiplied as a whole.
+ */
+#define pct_to_real(tunable)	\
+	(div64_u64((u64)(tunable) * (u64)max_task_load(), 100))
+
+/*
+ * Convert the load value @tunable to a percentage of max_task_load().
+ * The argument is parenthesized so that an expression such as "a + b" is
+ * cast and multiplied as a whole.
+ */
+#define real_to_pct(tunable)	\
+	(div64_u64((u64)(tunable) * (u64)100, (u64)max_task_load()))
+
+#define SCHED_HIGH_IRQ_TIMEOUT 3
+/*
+ * Return @cpu's average irq load if it is recent, otherwise 0.
+ *
+ * The value counts as recent when rq->irqload_ts is less than
+ * SCHED_HIGH_IRQ_TIMEOUT jiffies old.
+ */
+static inline u64 sched_irqload(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	s64 age = get_jiffies_64() - rq->irqload_ts;
+
+	/*
+	 * The current context can be preempted by an irq which updates
+	 * rq->irqload_ts, so the age may come out negative. That is fine:
+	 * it means an irq occurred recently, and the signed comparison
+	 * reports the load in that case too.
+	 */
+	return age < SCHED_HIGH_IRQ_TIMEOUT ? rq->avg_irqload : 0;
+}
+
+/*
+ * Non-zero when sched_irqload(@cpu) has reached
+ * sysctl_sched_cpu_high_irqload.
+ */
+static inline int sched_cpu_high_irqload(int cpu)
+{
+	u64 irqload = sched_irqload(cpu);
+
+	return irqload >= sysctl_sched_cpu_high_irqload;
+}
+
+/*
+ * True when @p has a related thread group attached (p->grp is non-NULL).
+ * Only the pointer value is inspected; it is not dereferenced.
+ */
+static inline bool task_in_related_thread_group(struct task_struct *p)
+{
+	return rcu_access_pointer(p->grp) != NULL;
+}
+
+/* Return @p's related thread group, fetched with rcu_dereference(). */
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+	struct related_thread_group *grp = rcu_dereference(p->grp);
+
+	return grp;
+}
+
+/*
+ * Signed change in predicted demand. Takes no arguments: it expands to an
+ * expression that uses the names "new_pred_demand" and "p", which must
+ * exist in the scope where the macro is used.
+ */
+#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
+
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+
+extern void notify_migration(int src_cpu, int dest_cpu,
+ bool src_cpu_dead, struct task_struct *p);
+
+/*
+ * Is the frequency of @src_cpu and @dst_cpu synchronized?
+ *
+ * A cpu is always in its own domain; otherwise @dst_cpu must be set in
+ * @src_cpu's freq_domain_cpumask.
+ */
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+	if (src_cpu == dst_cpu)
+		return 1;
+
+	return cpumask_test_cpu(dst_cpu, &cpu_rq(src_cpu)->freq_domain_cpumask);
+}
+
+#define BOOST_KICK 0
+#define CPU_RESERVED 1
+
+/* Test the CPU_RESERVED bit in the hmp_flags of @cpu's runqueue. */
+static inline int is_reserved(int cpu)
+{
+	return test_bit(CPU_RESERVED, &cpu_rq(cpu)->hmp_flags);
+}
+
+/*
+ * Set the CPU_RESERVED bit in the hmp_flags of @cpu's runqueue and return
+ * what test_and_set_bit() reports for it.
+ */
+static inline int mark_reserved(int cpu)
+{
+	return test_and_set_bit(CPU_RESERVED, &cpu_rq(cpu)->hmp_flags);
+}
+
+/* Clear the CPU_RESERVED bit in the hmp_flags of @cpu's runqueue. */
+static inline void clear_reserved(int cpu)
+{
+	clear_bit(CPU_RESERVED, &cpu_rq(cpu)->hmp_flags);
+}
+
+/*
+ * Return the cumulative runnable average of @cpu.
+ *
+ * For a sync wakeup (@sync set) evaluated on the cpu we are running on,
+ * the demand of the currently running task is discounted from the result,
+ * which is floored at 0.
+ */
+static inline u64 cpu_cravg_sync(int cpu, int sync)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 load = rq->hmp_stats.cumulative_runnable_avg;
+
+	/* Only a sync wakeup on the local cpu gets the discount. */
+	if (!sync || cpu != smp_processor_id())
+		return load;
+
+	if (load > rq->curr->ravg.demand)
+		return load - rq->curr->ravg.demand;
+
+	return 0;
+}
+
+/*
+ * True when @p's average burst is below sysctl_sched_short_burst and its
+ * average sleep time is above sysctl_sched_short_sleep.
+ */
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+	if (p->ravg.avg_burst >= sysctl_sched_short_burst)
+		return false;
+
+	return p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
+}
+
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
+extern void pre_big_task_count_change(const struct cpumask *cpus);
+extern void post_big_task_count_change(const struct cpumask *cpus);
+extern void set_hmp_defaults(void);
+extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost);
+extern unsigned int power_cost(int cpu, u64 demand);
+extern void reset_all_window_stats(u64 window_start, unsigned int window_size);
+extern int sched_boost(void);
+extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
+ enum sched_boost_policy boost_policy);
+extern enum sched_boost_policy sched_boost_policy(void);
+extern int task_will_fit(struct task_struct *p, int cpu);
+extern u64 cpu_load(int cpu);
+extern u64 cpu_load_sync(int cpu, int sync);
+extern int preferred_cluster(struct sched_cluster *cluster,
+ struct task_struct *p);
+extern void inc_nr_big_task(struct hmp_sched_stats *stats,
+ struct task_struct *p);
+extern void dec_nr_big_task(struct hmp_sched_stats *stats,
+ struct task_struct *p);
+extern void inc_rq_hmp_stats(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern void dec_rq_hmp_stats(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra);
+extern int is_big_task(struct task_struct *p);
+extern int upmigrate_discouraged(struct task_struct *p);
+extern struct sched_cluster *rq_cluster(struct rq *rq);
+extern int nr_big_tasks(struct rq *rq);
+extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 delta);
+extern void reset_task_stats(struct task_struct *p);
+extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra);
+extern void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft);
+extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 upmigrate_discourage);
+extern void sched_boost_parse_dt(void);
+extern void clear_top_tasks_bitmap(unsigned long *bitmap);
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+extern bool task_sched_boost(struct task_struct *p);
+extern int sync_cgroup_colocation(struct task_struct *p, bool insert);
+extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2);
+extern void update_cgroup_boost_settings(void);
+extern void restore_cgroup_boost_settings(void);
+
+#else
+/*
+ * Fallback used when CONFIG_SCHED_TUNE and CONFIG_CGROUP_SCHEDTUNE are not
+ * both enabled: any two tasks are reported as sharing a schedtune group.
+ */
+static inline bool
+same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+ return true;
+}
+
+/*
+ * Fallback used when CONFIG_SCHED_TUNE and CONFIG_CGROUP_SCHEDTUNE are not
+ * both enabled: always returns true for every task.
+ */
+static inline bool task_sched_boost(struct task_struct *p)
+{
+ return true;
+}
+
+static inline void update_cgroup_boost_settings(void) { }
+static inline void restore_cgroup_boost_settings(void) { }
+#endif
+
+extern int alloc_related_thread_groups(void);
+
+#else /* CONFIG_SCHED_HMP */
+
+struct hmp_sched_stats;
+struct related_thread_group;
+struct sched_cluster;
+
+static inline enum sched_boost_policy sched_boost_policy(void)
+{
+ return SCHED_BOOST_NONE;
+}
+
+static inline bool task_sched_boost(struct task_struct *p)
+{
+ return true;
+}
+
+static inline int got_boost_kick(void)
+{
+ return 0;
+}
+
+static inline void update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+
+/* !CONFIG_SCHED_HMP stub: always reports false. */
+static inline bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+	return false;
+}
+
+static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { }
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void clear_boost_kick(int cpu) { }
+static inline void clear_hmp_request(int cpu) { }
+static inline void mark_task_starting(struct task_struct *p) { }
+static inline void set_window_start(struct rq *rq) { }
+static inline void init_clusters(void) {}
+static inline void update_cluster_topology(void) { }
+static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
+static inline void set_task_last_switch_out(struct task_struct *p,
+ u64 wallclock) { }
+
+static inline int task_will_fit(struct task_struct *p, int cpu)
+{
+ return 1;
+}
+
+static inline int select_best_cpu(struct task_struct *p, int target,
+ int reason, int sync)
+{
+ return 0;
+}
+
+static inline unsigned int power_cost(int cpu, u64 demand)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline int sched_boost(void)
+{
+ return 0;
+}
+
+static inline int is_big_task(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline int nr_big_tasks(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int is_cpu_throttling_imminent(int cpu)
+{
+ return 0;
+}
+
+static inline int is_task_migration_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline unsigned int cpu_temp(int cpu)
+{
+ return 0;
+}
+
+static inline void
+inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+
+static inline void
+dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+
+static inline void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ return 1;
+}
+
+static inline struct sched_cluster *rq_cluster(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void init_new_task_load(struct task_struct *p)
+{
+}
+
+static inline u64 scale_load_to_cpu(u64 load, int cpu)
+{
+ return load;
+}
+
+static inline unsigned int nr_eligible_big_tasks(int cpu)
+{
+ return 0;
+}
+
+static inline bool is_max_capacity_cpu(int cpu) { return true; }
+
+static inline int pct_task_load(struct task_struct *p) { return 0; }
+
+static inline int cpu_capacity(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
+
+static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+}
+
+static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock)
+{
+}
+
+static inline int sched_cpu_high_irqload(int cpu) { return 0; }
+
+static inline void set_preferred_cluster(struct related_thread_group *grp) { }
+
+static inline bool task_in_related_thread_group(struct task_struct *p)
+{
+ return false;
+}
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+static inline u32 task_load(struct task_struct *p) { return 0; }
+
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ return 0;
+}
+
+static inline void add_new_task_to_grp(struct task_struct *new) {}
+
+#define PRED_DEMAND_DELTA (0)
+
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
+
+static inline void notify_migration(int src_cpu, int dest_cpu,
+ bool src_cpu_dead, struct task_struct *p) { }
+
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ return 1;
+}
+
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
+/*
+ * !CONFIG_SCHED_HMP stubs. They accept the same cpumask argument as the
+ * CONFIG_SCHED_HMP prototypes so that a caller compiles identically in
+ * both configurations.
+ */
+static inline void pre_big_task_count_change(const struct cpumask *cpus) { }
+static inline void post_big_task_count_change(const struct cpumask *cpus) { }
+static inline void set_hmp_defaults(void) { }
+
+static inline void clear_reserved(int cpu) { }
+static inline void sched_boost_parse_dt(void) {}
+static inline int alloc_related_thread_groups(void) { return 0; }
+
+#define trace_sched_cpu_load(...)
+#define trace_sched_cpu_load_lb(...)
+#define trace_sched_cpu_load_cgroup(...)
+#define trace_sched_cpu_load_wakeup(...)
+
+static inline void update_avg_burst(struct task_struct *p) {}
+
+#endif /* CONFIG_SCHED_HMP */
+
+/*
+ * Returns the rq capacity of any rq in a group. This does not play
+ * well with groups where rq capacity can change independently.
+ */
+#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
+
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -1032,7 +1797,6 @@ static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-
#endif /* CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
@@ -1090,7 +1854,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
@@ -1186,6 +1950,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+#define WF_NO_NOTIFIER 0x08 /* do not notify governor */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1240,19 +2005,41 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+
#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_HEAD 0x02
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING 0x20
#else
#define ENQUEUE_WAKING 0x00
#endif
-#define ENQUEUE_REPLENISH 0x08
-#define ENQUEUE_RESTORE 0x10
-#define ENQUEUE_WAKEUP_NEW 0x20
-
-#define DEQUEUE_SLEEP 0x01
-#define DEQUEUE_SAVE 0x02
+#define ENQUEUE_WAKEUP_NEW 0x40
#define RETRY_TASK ((void *)-1UL)
@@ -1319,6 +2106,12 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
+#ifdef CONFIG_SCHED_HMP
+ void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
+ void (*dec_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
+ void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand);
+#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
@@ -1343,6 +2136,7 @@ extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
+extern void nohz_balance_clear_nohz_mask(int cpu);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
@@ -1400,7 +2194,9 @@ static inline int idle_get_state_idx(struct rq *rq)
}
#endif
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
extern void sysrq_sched_debug_show(void);
+#endif
extern void sched_init_granularity(void);
extern void update_max_interval(void);
@@ -1428,6 +2224,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
+ sched_update_nr_prod(cpu_of(rq), count, true);
rq->nr_running = prev_nr + count;
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1454,6 +2251,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count)
static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
+ sched_update_nr_prod(cpu_of(rq), count, false);
rq->nr_running -= count;
}
@@ -1617,6 +2415,7 @@ static inline unsigned long __cpu_util(int cpu, int delta)
util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
walt_ravg_window >> SCHED_LOAD_SHIFT);
#endif
+
delta += util;
if (delta < 0)
return 0;
@@ -1644,6 +2443,20 @@ static inline unsigned long cpu_util_freq(int cpu)
#endif
+#ifdef CONFIG_SCHED_HMP
+/*
+ * HMP and EAS are orthogonal. Hopefully the compiler just elides out all code
+ * with the energy_aware() check, so that we don't even pay the comparison
+ * penalty at runtime.
+ */
+#define energy_aware() false
+#else
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1884,6 +2697,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+/*
+ * task_may_not_preempt - check whether a task may not be preemptible soon
+ */
+extern bool task_may_not_preempt(struct task_struct *task, int cpu);
+
#else /* CONFIG_SMP */
/*
@@ -1951,6 +2769,9 @@ enum rq_nohz_flag_bits {
NOHZ_BALANCE_KICK,
};
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
@@ -2032,6 +2853,18 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
+#ifdef CONFIG_SCHED_HMP
+ /*
+ * Skip if we've already reported, but not if this is an inter-cluster
+ * migration
+ */
+ if (!sched_disable_window_stats &&
+ (rq->load_reported_window == rq->window_start) &&
+ !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
+ return;
+ rq->load_reported_window = rq->window_start;
+#endif
+
data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
if (data)
data->func(data, rq_clock(rq), flags);
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
new file mode 100644
index 000000000000..f03ed685f102
--- /dev/null
+++ b/kernel/sched/sched_avg.c
@@ -0,0 +1,199 @@
+/* Copyright (c) 2012, 2015-2017, 2018 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * Scheduler hook for average runqueue determination
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+
+#include "sched.h"
+#include <trace/events/sched.h>
+
+static DEFINE_PER_CPU(u64, nr_prod_sum);
+static DEFINE_PER_CPU(u64, last_time);
+static DEFINE_PER_CPU(u64, nr_big_prod_sum);
+static DEFINE_PER_CPU(u64, nr);
+static DEFINE_PER_CPU(u64, nr_max);
+
+static DEFINE_PER_CPU(unsigned long, iowait_prod_sum);
+static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
+static s64 last_get_time;
+
+/* Divide X by Y with div64_u64(), rounding up. Y is evaluated twice. */
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + ((Y) - 1), (Y))
+/**
+ * sched_get_nr_running_avg
+ * @avg: Output: time-weighted average of the per-cpu nr count, summed over
+ *       all possible CPUs, since the last poll.
+ * @iowait_avg: Output: the same average computed from nr_iowait_cpu().
+ * @big_avg: Output: the same average computed from nr_eligible_big_tasks().
+ * @max_nr: Output: largest per-cpu nr_max value seen since the last poll.
+ * @big_max_nr: Output: as @max_nr, restricted to CPUs for which
+ *              is_max_capacity_cpu() is true.
+ *
+ * The three averages are integer quotients rounded up by
+ * DIV64_U64_ROUNDUP(); they are not scaled by 100. All outputs are zeroed
+ * first and stay zero if no time has elapsed since the previous call.
+ *
+ * Obtains the average nr_running value since the last poll.
+ * This function may not be called concurrently with itself
+ */
+void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg,
+ unsigned int *max_nr, unsigned int *big_max_nr)
+{
+ int cpu;
+ u64 curr_time = sched_clock();
+ u64 diff = curr_time - last_get_time;
+ u64 tmp_avg = 0, tmp_iowait = 0, tmp_big_avg = 0;
+
+ *avg = 0;
+ *iowait_avg = 0;
+ *big_avg = 0;
+ *max_nr = 0;
+ *big_max_nr = 0;
+
+ if (!diff)
+ return;
+
+ /* read and reset nr_running counts */
+ for_each_possible_cpu(cpu) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+ curr_time = sched_clock();
+ /* time since this cpu's accounting was last brought up to date */
+ diff = curr_time - per_cpu(last_time, cpu);
+ BUG_ON((s64)diff < 0);
+
+ /* stored product sums plus the still-open interval at current counts */
+ tmp_avg += per_cpu(nr_prod_sum, cpu);
+ tmp_avg += per_cpu(nr, cpu) * diff;
+
+ tmp_big_avg += per_cpu(nr_big_prod_sum, cpu);
+ tmp_big_avg += nr_eligible_big_tasks(cpu) * diff;
+
+ tmp_iowait += per_cpu(iowait_prod_sum, cpu);
+ tmp_iowait += nr_iowait_cpu(cpu) * diff;
+
+ per_cpu(last_time, cpu) = curr_time;
+
+ per_cpu(nr_prod_sum, cpu) = 0;
+ per_cpu(nr_big_prod_sum, cpu) = 0;
+ per_cpu(iowait_prod_sum, cpu) = 0;
+
+ if (*max_nr < per_cpu(nr_max, cpu))
+ *max_nr = per_cpu(nr_max, cpu);
+
+ if (is_max_capacity_cpu(cpu)) {
+ if (*big_max_nr < per_cpu(nr_max, cpu))
+ *big_max_nr = per_cpu(nr_max, cpu);
+ }
+
+ /* restart peak tracking from the current count */
+ per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+ spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+ }
+
+ /* curr_time is now the sched_clock() sample taken for the last cpu visited */
+ diff = curr_time - last_get_time;
+ last_get_time = curr_time;
+
+ /*
+ * Any task running on BIG cluster and BIG tasks running on little
+ * cluster contributes to big_avg. Small or medium tasks can also
+ * run on BIG cluster when co-location and scheduler boost features
+ * are activated. We don't want these tasks to downmigrate to little
+ * cluster when BIG CPUs are available but isolated. Round up the
+ * average values so that core_ctl aggressively unisolates BIG CPUs.
+ */
+ *avg = (int)DIV64_U64_ROUNDUP(tmp_avg, diff);
+ *big_avg = (int)DIV64_U64_ROUNDUP(tmp_big_avg, diff);
+ *iowait_avg = (int)DIV64_U64_ROUNDUP(tmp_iowait, diff);
+
+ trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg,
+ *max_nr, *big_max_nr);
+
+ BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0);
+ pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n",
+ __func__, *avg, *big_avg, *iowait_avg);
+}
+EXPORT_SYMBOL(sched_get_nr_running_avg);
+
+static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0);
+
+#define BUSY_NR_RUN 3
+#define BUSY_LOAD_FACTOR 10
+
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Stamp @cpu's last_busy_time with @curr_time when either trigger fires:
+ *  - @prev_nr_run was at least BUSY_NR_RUN and the per-cpu nr count is now
+ *    below it, or
+ *  - on a dequeue, the CPU-scaled cumulative runnable average multiplied by
+ *    BUSY_LOAD_FACTOR exceeds sched_ravg_window.
+ * Nothing is done unless hmp_capable() holds and @cpu is not a
+ * minimum-capacity CPU.
+ */
+static inline void update_last_busy_time(int cpu, bool dequeue,
+				unsigned long prev_nr_run, u64 curr_time)
+{
+	bool busy;
+
+	if (!hmp_capable() || is_min_capacity_cpu(cpu))
+		return;
+
+	busy = prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN;
+
+	if (dequeue) {
+		u64 scaled = scale_load_to_cpu(
+				cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg,
+				cpu);
+
+		if (scaled * BUSY_LOAD_FACTOR > sched_ravg_window)
+			busy = true;
+	}
+
+	if (busy)
+		atomic64_set(&per_cpu(last_busy_time, cpu), curr_time);
+}
+#else
+/* !CONFIG_SCHED_HMP stub: last_busy_time is never updated. */
+static inline void update_last_busy_time(int cpu, bool dequeue,
+ unsigned long prev_nr_run, u64 curr_time)
+{
+}
+#endif
+
+/**
+ * sched_update_nr_prod
+ * @cpu: CPU whose per-cpu nr accounting is updated.
+ * @delta: Adjust nr by 'delta' amount
+ * @inc: Whether we are increasing or decreasing the count
+ * @return: N/A
+ *
+ * Update average with latest nr_running value for CPU. The interval since
+ * the previous update is charged at the count that was in effect before
+ * this change.
+ */
+void sched_update_nr_prod(int cpu, long delta, bool inc)
+{
+ u64 diff;
+ u64 curr_time;
+ unsigned long flags, nr_running;
+
+ spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+ /* snapshot the count before applying delta */
+ nr_running = per_cpu(nr, cpu);
+ curr_time = sched_clock();
+ diff = curr_time - per_cpu(last_time, cpu);
+ BUG_ON((s64)diff < 0);
+ per_cpu(last_time, cpu) = curr_time;
+ per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta);
+
+ BUG_ON((s64)per_cpu(nr, cpu) < 0);
+
+ /* track the peak count since sched_get_nr_running_avg() last reset it */
+ if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
+ per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+
+ update_last_busy_time(cpu, !inc, nr_running, curr_time);
+
+ /* accumulate count * elapsed-time products for the averages */
+ per_cpu(nr_prod_sum, cpu) += nr_running * diff;
+ per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff;
+ per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
+ spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+}
+EXPORT_SYMBOL(sched_update_nr_prod);
+
+/* Read the last_busy_time stamp recorded for @cpu. */
+u64 sched_get_cpu_last_busy_time(int cpu)
+{
+	atomic64_t *stamp = &per_cpu(last_busy_time, cpu);
+
+	return atomic64_read(stamp);
+}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index a5567ccd8803..3278c81cefb1 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,5 +1,4 @@
#include "sched.h"
-#include "walt.h"
/*
* stop-task scheduling class.
@@ -19,6 +18,41 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+/*
+ * Stop-class hook: account @p in @rq's hmp_stats through
+ * inc_cumulative_runnable_avg(). Called from enqueue_task_stop() and
+ * installed as stop_sched_class.inc_hmp_sched_stats.
+ */
+static void
+inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+/*
+ * Stop-class hook: remove @p from @rq's hmp_stats through
+ * dec_cumulative_runnable_avg(). Called from dequeue_task_stop() and
+ * installed as stop_sched_class.dec_hmp_sched_stats.
+ */
+static void
+dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+/*
+ * Stop-class hook: forward the change in task load (new value minus the
+ * current task_load(@p)) and the PRED_DEMAND_DELTA value to
+ * fixup_cumulative_runnable_avg() for @rq's hmp_stats.
+ */
+static void
+fixup_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p,
+			   u32 new_task_load, u32 new_pred_demand)
+{
+	s64 load_change;
+	s64 pred_change;
+
+	load_change = (s64)new_task_load - task_load(p);
+	pred_change = PRED_DEMAND_DELTA;
+
+	fixup_cumulative_runnable_avg(&rq->hmp_stats, p, load_change,
+				      pred_change);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+/*
+ * !CONFIG_SCHED_HMP stubs: enqueue_task_stop()/dequeue_task_stop() still
+ * call these, but there are no HMP statistics to maintain.
+ */
+static inline void
+inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -44,14 +78,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
- walt_inc_cumulative_runnable_avg(rq, p);
+ inc_hmp_sched_stats_stop(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
- walt_dec_cumulative_runnable_avg(rq, p);
+ dec_hmp_sched_stats_stop(rq, p);
}
static void yield_task_stop(struct rq *rq)
@@ -138,4 +172,9 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_stop,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_stop,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop,
+#endif
};
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index d444fc1a4d58..b84d13750604 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -121,6 +121,33 @@ struct schedtune {
/* Boost value for tasks on that SchedTune CGroup */
int boost;
+#ifdef CONFIG_SCHED_HMP
+ /* Toggle ability to override sched boost enabled */
+ bool sched_boost_no_override;
+
+ /*
+ * Controls whether a cgroup is eligible for sched boost or not. This
+ * can temporarily be disabled by the kernel based on the no_override
+ * flag above.
+ */
+ bool sched_boost_enabled;
+
+ /*
+ * This tracks the default value of sched_boost_enabled and is used
+ * to restore the value following any temporary changes to that flag.
+ */
+ bool sched_boost_enabled_backup;
+
+ /*
+ * Controls whether tasks of this cgroup should be colocated with each
+ * other and tasks of other cgroups that have the same flag turned on.
+ */
+ bool colocate;
+
+ /* Controls whether further updates are allowed to the colocate flag */
+ bool colocate_update_disabled;
+#endif
+
/* Performance Boost (B) region threshold params */
int perf_boost_idx;
@@ -134,7 +161,7 @@ struct schedtune {
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
- return css ? container_of(css, struct schedtune, css) : NULL;
+ return container_of(css, struct schedtune, css);
}
static inline struct schedtune *task_schedtune(struct task_struct *tsk)
@@ -159,6 +186,13 @@ static inline struct schedtune *parent_st(struct schedtune *st)
static struct schedtune
root_schedtune = {
.boost = 0,
+#ifdef CONFIG_SCHED_HMP
+ .sched_boost_no_override = false,
+ .sched_boost_enabled = true,
+ .sched_boost_enabled_backup = true,
+ .colocate = false,
+ .colocate_update_disabled = false,
+#endif
.perf_boost_idx = 0,
.perf_constrain_idx = 0,
.prefer_idle = 0,
@@ -239,6 +273,121 @@ struct boost_groups {
/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Give a schedtune group its default HMP settings: boost enabled (and
+ * remembered as the backup value), no override, no colocation, and
+ * colocate updates still permitted.
+ */
+static inline void init_sched_boost(struct schedtune *st)
+{
+	st->sched_boost_enabled = true;
+	st->sched_boost_enabled_backup = st->sched_boost_enabled;
+	st->sched_boost_no_override = false;
+
+	st->colocate_update_disabled = false;
+	st->colocate = false;
+}
+
+/* True when both tasks resolve to the same schedtune group. */
+bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+	struct schedtune *first = task_schedtune(tsk1);
+	struct schedtune *second = task_schedtune(tsk2);
+
+	return first == second;
+}
+
+/*
+ * Walk allocated_group[] up to the first empty slot and switch off
+ * sched_boost_enabled for every group that has not set
+ * sched_boost_no_override.
+ */
+void update_cgroup_boost_settings(void)
+{
+	int idx;
+
+	for (idx = 0; idx < BOOSTGROUPS_COUNT && allocated_group[idx]; idx++) {
+		if (!allocated_group[idx]->sched_boost_no_override)
+			allocated_group[idx]->sched_boost_enabled = false;
+	}
+}
+
+/*
+ * Walk allocated_group[] up to the first empty slot and copy each group's
+ * sched_boost_enabled_backup back into sched_boost_enabled.
+ */
+void restore_cgroup_boost_settings(void)
+{
+	int idx;
+
+	for (idx = 0; idx < BOOSTGROUPS_COUNT && allocated_group[idx]; idx++)
+		allocated_group[idx]->sched_boost_enabled =
+			allocated_group[idx]->sched_boost_enabled_backup;
+}
+
+/* Return the sched_boost_enabled flag of the schedtune group @p is in. */
+bool task_sched_boost(struct task_struct *p)
+{
+	return task_schedtune(p)->sched_boost_enabled;
+}
+
+/* cftype read handler: report the group's sched_boost_no_override flag. */
+static u64
+sched_boost_override_read(struct cgroup_subsys_state *css,
+			  struct cftype *cft)
+{
+	return css_st(css)->sched_boost_no_override;
+}
+
+/*
+ * cftype write handler: store @override, normalised to a boolean, in the
+ * group's sched_boost_no_override flag. Always returns 0.
+ */
+static int sched_boost_override_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 override)
+{
+	css_st(css)->sched_boost_no_override = (override != 0);
+
+	return 0;
+}
+
+/* cftype read handler: report the group's sched_boost_enabled flag. */
+static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	return css_st(css)->sched_boost_enabled;
+}
+
+/*
+ * cftype write handler: store @enable, normalised to a boolean, in both
+ * sched_boost_enabled and its backup copy. Always returns 0.
+ */
+static int sched_boost_enabled_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 enable)
+{
+	struct schedtune *group = css_st(css);
+	bool on = (enable != 0);
+
+	group->sched_boost_enabled = on;
+	group->sched_boost_enabled_backup = on;
+
+	return 0;
+}
+
+/* cftype read handler: report the group's colocate flag. */
+static u64 sched_colocate_read(struct cgroup_subsys_state *css,
+			struct cftype *cft)
+{
+	return css_st(css)->colocate;
+}
+
+/*
+ * cftype write handler for the colocate flag. The first write stores
+ * @colocate (normalised to a boolean) and latches
+ * colocate_update_disabled; once that latch is set, writes fail with
+ * -EPERM.
+ */
+static int sched_colocate_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 colocate)
+{
+	struct schedtune *group = css_st(css);
+
+	if (!group->colocate_update_disabled) {
+		group->colocate = (colocate != 0);
+		group->colocate_update_disabled = true;
+		return 0;
+	}
+
+	return -EPERM;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_sched_boost(struct schedtune *st) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static void
schedtune_cpu_update(int cpu)
{
@@ -569,7 +718,7 @@ prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
u64 prefer_idle)
{
struct schedtune *st = css_st(css);
- st->prefer_idle = prefer_idle;
+ st->prefer_idle = !!prefer_idle;
return 0;
}
@@ -619,6 +768,22 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
return 0;
}
+static void schedtune_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct schedtune *st;
+ bool colocate;
+
+ cgroup_taskset_first(tset, &css);
+ st = css_st(css);
+
+ colocate = st->colocate;
+
+ cgroup_taskset_for_each(task, css, tset)
+ sync_cgroup_colocation(task, colocate);
+}
+
static struct cftype files[] = {
{
.name = "boost",
@@ -630,6 +795,23 @@ static struct cftype files[] = {
.read_u64 = prefer_idle_read,
.write_u64 = prefer_idle_write,
},
+#ifdef CONFIG_SCHED_HMP
+ {
+ .name = "sched_boost_no_override",
+ .read_u64 = sched_boost_override_read,
+ .write_u64 = sched_boost_override_write,
+ },
+ {
+ .name = "sched_boost_enabled",
+ .read_u64 = sched_boost_enabled_read,
+ .write_u64 = sched_boost_enabled_write,
+ },
+ {
+ .name = "colocate",
+ .read_u64 = sched_colocate_read,
+ .write_u64 = sched_colocate_write,
+ },
+#endif
{ } /* terminate */
};
@@ -683,6 +865,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
/* Initialize per CPUs boost group support */
st->idx = idx;
+ init_sched_boost(st);
if (schedtune_boostgroup_init(st))
goto release;
@@ -720,6 +903,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
.cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,
.early_init = 1,
+ .attach = schedtune_attach,
};
static inline void
@@ -915,7 +1099,8 @@ schedtune_init(void)
*/
sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
if (!sd) {
- pr_info("schedtune: no energy model data\n");
+ if (energy_aware())
+ pr_warn("schedtune: no energy model data\n");
goto nodata;
}