author     Olav Haugan <ohaugan@codeaurora.org>    2014-12-06 10:09:43 -0800
committer  David Keitel <dkeitel@codeaurora.org>   2016-03-23 20:01:21 -0700
commit     90dc3fa9a78bc47e995772172cce0089b6f17ec1 (patch)
tree       ded55888710d6b3fcbc8e378c9293bdefb36d18f
parent     c7b587d9aa093de529b2fc6efd90b1f856a78bc7 (diff)
sched: Avoid frequent task migration due to EA in lb
A new tunable exists that allows task migration to be throttled when the scheduler tries to do task migrations due to Energy Awareness (EA). This tunable is only taken into account when migrations occur in the tick path. Extend the usage of the tunable to take the load balancer (lb) path into account as well.

In addition, ensure that the start of task execution on a CPU is updated correctly. If a task is preempted but still runnable on the same CPU, the start of execution should not be updated. Only update the start of execution when a task wakes up after sleep or moves to a new CPU.

Change-Id: I6b2a8e06d8d2df8e0f9f62b7aba3b4ee4b2c1c4d
Signed-off-by: Olav Haugan <ohaugan@codeaurora.org>
[rameezmustafa@codeaurora.org: Port to msm-3.18]
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
[joonwoop@codeaurora.org: fixed conflict in group_classify() and set_task_cpu().]
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
-rw-r--r--  Documentation/scheduler/sched-hmp.txt  |  4
-rw-r--r--  kernel/sched/core.c                    |  6
-rw-r--r--  kernel/sched/fair.c                    | 42
-rw-r--r--  kernel/sysctl.c                        |  2
4 files changed, 40 insertions(+), 14 deletions(-)
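
Before diving into the hunks: the core rule this patch extends to the load balancer can be sketched in a few lines. The snippet below is not part of the patch; it is a user-space model in which run_start and sched_min_runtime mirror the kernel fields touched further down, while the struct, main() and the sample values are illustrative only.

  /*
   * User-space sketch of the throttling check this commit reuses in the
   * load balancer path (see is_task_migration_throttled() in the
   * kernel/sched/fair.c hunk below). Not kernel code.
   */
  #include <stdint.h>
  #include <stdio.h>

  static uint64_t sched_min_runtime;      /* ns, derived from the sysctl */

  struct task {
          uint64_t run_start;             /* ns, last start of execution on this CPU */
  };

  /* Nonzero while EA migrations of the task should be skipped. */
  static int is_task_migration_throttled(const struct task *p, uint64_t now_ns)
  {
          uint64_t delta = now_ns - p->run_start;

          return delta < sched_min_runtime;
  }

  int main(void)
  {
          struct task p = { .run_start = 1000000 };       /* started at 1 ms */

          sched_min_runtime = 5000000;                    /* example: 5 ms */
          printf("%d\n", is_task_migration_throttled(&p, 3000000));   /* 2 ms on CPU -> 1 */
          printf("%d\n", is_task_migration_throttled(&p, 10000000));  /* 9 ms on CPU -> 0 */
          return 0;
  }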
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index aeecb0c15657..54add3c4b94f 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -1254,7 +1254,7 @@ idle CPUs which are not completely idle, increasing task packing behavior.
Appears at: /proc/sys/kernel/sched_min_runtime
-Default value: 200000000 (200ms)
+Default value: 0 (0 ms)
This tunable helps avoid frequent migration of tasks on account of
energy-awareness. During scheduler tick, a check is made (in migration_needed())
@@ -1267,6 +1267,8 @@ is used in migration_needed() to avoid "frequent" migrations. Once a task has
been associated with a cpu (in either running or runnable state) for more than
'sched_min_runtime' ns, it is considered eligible for migration in the tick path on
account of energy awareness reasons.
+The same logic also applies to the load balancer path to avoid frequent
+migrations due to energy awareness.
=========================
8. HMP SCHEDULER TRACE POINTS
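
The tick-path check described in the tunable documentation above (and reused by the load balancer after this patch) can be modelled roughly as follows. This is a user-space sketch under assumed names, not the kernel's migration_needed(); the boolean inputs stand in for sched_enable_power_aware, is_task_migration_throttled(), is_cpu_throttling_imminent() and lower_power_cpu_available() as they appear in the kernel/sched/fair.c hunks below.

  /* Sketch of the EA branch of the migration decision; illustrative only. */
  #include <stdbool.h>
  #include <stdio.h>

  #define MOVE_TO_POWER_EFFICIENT_CPU 1
  #define NO_MIGRATION                0

  struct task_state {
          bool throttled;               /* ran < sched_min_runtime on this CPU */
          bool lower_power_cpu_avail;   /* an idle, cheaper CPU can fit the task */
  };

  struct cpu_state {
          bool power_aware;             /* sched_enable_power_aware */
          bool throttling_imminent;     /* is_cpu_throttling_imminent() */
  };

  static int ea_migration_needed(const struct cpu_state *c,
                                 const struct task_state *p)
  {
          if (c->power_aware && !p->throttled &&
              c->throttling_imminent && p->lower_power_cpu_avail)
                  return MOVE_TO_POWER_EFFICIENT_CPU;
          return NO_MIGRATION;
  }

  int main(void)
  {
          struct cpu_state c = { .power_aware = true, .throttling_imminent = true };
          struct task_state p = { .throttled = true, .lower_power_cpu_avail = true };

          printf("%d\n", ea_migration_needed(&c, &p));  /* freshly moved task stays: 0 */
          p.throttled = false;
          printf("%d\n", ea_migration_needed(&c, &p));  /* eligible again: 1 */
          return 0;
  }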
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3d385c2dac6..290b8df6f6e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2846,7 +2846,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
- note_run_start(p, -1);
+ note_run_start(p, sched_clock());
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
@@ -3584,6 +3584,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
+ } else {
+ note_run_start(p, wallclock);
}
#endif /* CONFIG_SMP */
@@ -4786,7 +4788,6 @@ static void __sched notrace __schedule(bool preempt)
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
- note_run_start(prev, -1);
prev->on_rq = 0;
/*
@@ -4815,7 +4816,6 @@ static void __sched notrace __schedule(bool preempt)
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
- note_run_start(next, wallclock);
BUG_ON(task_cpu(next) != cpu_of(rq));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f0d34f818428..910e50456c60 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2543,7 +2543,12 @@ unsigned int __read_mostly sched_init_task_load_pelt;
unsigned int __read_mostly sched_init_task_load_windows;
unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
-unsigned int __read_mostly sysctl_sched_min_runtime = 200000000; /* 200 ms */
+/*
+ * Keep these two below in sync. One is in unit of ns and the
+ * other in unit of us.
+ */
+unsigned int __read_mostly sysctl_sched_min_runtime = 0; /* 0 ms */
+u64 __read_mostly sched_min_runtime = 0; /* 0 ms */
static inline unsigned int task_load(struct task_struct *p)
{
@@ -3511,6 +3516,10 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write || !sched_enable_hmp)
return ret;
+ if (data == &sysctl_sched_min_runtime) {
+ sched_min_runtime = ((u64) sysctl_sched_min_runtime) * 1000;
+ return 0;
+ }
if ((sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) ||
*data > 100) {
*data = old_val;
@@ -3610,10 +3619,6 @@ static int lower_power_cpu_available(struct task_struct *p, int cpu)
int i;
int lowest_power_cpu = task_cpu(p);
int lowest_power = power_cost(p, task_cpu(p));
- u64 delta = sched_clock() - p->run_start;
-
- if (delta < sysctl_sched_min_runtime)
- return 0;
/* Is a lower-powered idle CPU available which will fit this task? */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
@@ -3630,6 +3635,7 @@ static int lower_power_cpu_available(struct task_struct *p, int cpu)
}
static inline int is_cpu_throttling_imminent(int cpu);
+static inline int is_task_migration_throttled(struct task_struct *p);
/*
* Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal
@@ -3663,6 +3669,7 @@ static inline int migration_needed(struct rq *rq, struct task_struct *p)
return MOVE_TO_BIG_CPU;
if (sched_enable_power_aware &&
+ !is_task_migration_throttled(p) &&
is_cpu_throttling_imminent(cpu_of(rq)) &&
lower_power_cpu_available(p, cpu_of(rq)))
return MOVE_TO_POWER_EFFICIENT_CPU;
@@ -3737,6 +3744,13 @@ static inline int is_cpu_throttling_imminent(int cpu)
return throttling;
}
+static inline int is_task_migration_throttled(struct task_struct *p)
+{
+ u64 delta = sched_clock() - p->run_start;
+
+ return delta < sched_min_runtime;
+}
+
unsigned int cpu_temp(int cpu)
{
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
@@ -3808,6 +3822,11 @@ static inline int is_cpu_throttling_imminent(int cpu)
return 0;
}
+static inline int is_task_migration_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
unsigned int cpu_temp(int cpu)
{
return 0;
@@ -7768,6 +7787,8 @@ static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs, struct lb_env *env)
{
+ int cpu;
+
if (sgs->group_no_capacity) {
env->flags &= ~LBF_PWR_ACTIVE_BALANCE;
return group_overloaded;
@@ -7780,13 +7801,16 @@ group_type group_classify(struct sched_group *group,
/* Mark a less power-efficient CPU as busy only if we haven't
- * seen a busy group yet. We want to prioritize spreading
- * work over power optimization. */
+ * seen a busy group yet and we are close to throttling. We want to
+ * prioritize spreading work over power optimization.
+ */
+ cpu = cpumask_first(sched_group_cpus(group));
if ((capacity(env->dst_rq) == group_rq_capacity(group)) &&
sgs->sum_nr_running && (env->idle != CPU_NOT_IDLE) &&
power_cost_at_freq(env->dst_cpu, 0) <
- power_cost_at_freq(cpumask_first(sched_group_cpus(group)), 0) &&
- is_cpu_throttling_imminent(cpumask_first(sched_group_cpus(group)))) {
+ power_cost_at_freq(cpu, 0) &&
+ !is_task_migration_throttled(cpu_rq(cpu)->curr) &&
+ is_cpu_throttling_imminent(cpu)) {
env->flags |= LBF_PWR_ACTIVE_BALANCE;
return group_ea;
}
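
One detail worth pulling out of the fair.c hunks above: the tunable now lives in two variables, the sysctl-facing sysctl_sched_min_runtime and the sched_min_runtime copy the fast path compares against, and sched_hmp_proc_update_handler() keeps them in sync. The sysctl side appears to be taken in microseconds, since it is multiplied by 1000 before being compared against a sched_clock() delta in nanoseconds. A stand-alone sketch of that conversion, with user-space stand-ins for the kernel types:

  #include <stdint.h>
  #include <stdio.h>

  static unsigned int sysctl_sched_min_runtime;   /* written via /proc, us */
  static uint64_t sched_min_runtime;              /* consumed by the fast path, ns */

  /* Model of the sync done by the proc handler; not the kernel function. */
  static void update_sched_min_runtime(unsigned int new_val_us)
  {
          sysctl_sched_min_runtime = new_val_us;
          sched_min_runtime = (uint64_t)new_val_us * 1000;   /* us -> ns */
  }

  int main(void)
  {
          update_sched_min_runtime(200000);   /* example: 200000 us == 200 ms */
          printf("%u us -> %llu ns\n", sysctl_sched_min_runtime,
                 (unsigned long long)sched_min_runtime);
          return 0;
  }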
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9c2719cc9cc9..5e171b035482 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -377,7 +377,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_min_runtime,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sched_hmp_proc_update_handler,
},
{
.procname = "sched_spill_load",
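
With the sysctl.c change above, writes to the tunable are routed through sched_hmp_proc_update_handler() so both copies stay in sync. For completeness, a hypothetical way to exercise it from user space once the patch is applied; the path comes from the documentation hunk above, the value 100000 is only an example, and root plus an HMP-enabled kernel are assumed:

  #include <stdio.h>

  int main(void)
  {
          /* Hypothetical example value; requires root and an HMP kernel. */
          FILE *f = fopen("/proc/sys/kernel/sched_min_runtime", "w");

          if (!f) {
                  perror("sched_min_runtime");
                  return 1;
          }
          fprintf(f, "%u\n", 100000u);
          fclose(f);
          return 0;
  }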