 Documentation/scheduler/sched-hmp.txt | 17 +++++++++++++++++
 include/linux/sched/sysctl.h          |  1 +
 kernel/sched/fair.c                   | 33 +++++++++++++++++++++++++--------
 kernel/sched/hmp.c                    |  7 +++++++
 kernel/sysctl.c                       |  9 +++++++++
 5 files changed, 59 insertions(+), 8 deletions(-)
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 22449aec5558..b400e053e55d 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -1220,6 +1220,23 @@ This tunable is a percentage. Configure the minimum demand of big sync waker
task. Scheduler places small wakee tasks woken up by big sync waker on the
waker's cluster.
+*** 7.19 sched_prefer_sync_wakee_to_waker
+
+Appears at: /proc/sys/kernel/sched_prefer_sync_wakee_to_waker
+
+Default value: 0
+
+The default sync wakee policy prefers an idle CPU in the waker's cluster
+over the waker CPU, even when the waker CPU is running only 1 task. By
+selecting an idle CPU, it eliminates the chance of the waker migrating to
+a different CPU after the wakee preempts it. This policy is also not
+susceptible to incorrect "sync" usage, i.e. the case where the waker does
+not go to sleep after waking up the wakee.
+
+However, on some targets the LPM (low power mode) exit latency associated
+with an idle CPU outweighs the above benefits. When this knob is turned
+on, the waker CPU is selected instead if it has only 1 runnable task.
+
=========================
8. HMP SCHEDULER TRACE POINTS
=========================
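As a quick aside (not part of the patch), the knob documented above can be
flipped at run time with "echo 1 > /proc/sys/kernel/sched_prefer_sync_wakee_to_waker".
A minimal, hypothetical user-space sketch of the same in C, assuming a kernel
built with CONFIG_SCHED_HMP so that the procfs entry exists:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* 1 = prefer the waker CPU for sync wakees, 0 = default policy */
          int fd = open("/proc/sys/kernel/sched_prefer_sync_wakee_to_waker",
                        O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          if (write(fd, "1\n", 2) != 2)
                  perror("write");
          close(fd);
          return 0;
  }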
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 1f9c2c734b20..861f715a673d 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -64,6 +64,7 @@ extern unsigned int sysctl_sched_pred_alert_freq;
extern unsigned int sysctl_sched_freq_aggregate;
extern unsigned int sysctl_sched_enable_thread_grouping;
extern unsigned int sysctl_sched_freq_aggregate_threshold_pct;
+extern unsigned int sysctl_sched_prefer_sync_wakee_to_waker;
#else /* CONFIG_SCHED_HMP */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 14b8977d1be4..4489bec5d68a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2590,6 +2590,7 @@ static u32 __compute_runnable_contrib(u64 n)
#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
#define SBC_FLAG_CSTATE_LOAD 0x100
#define SBC_FLAG_BEST_SIBLING 0x200
+#define SBC_FLAG_WAKER_CPU 0x400
/* Cluster selection flag */
#define SBC_FLAG_COLOC_CLUSTER 0x10000
@@ -3060,6 +3061,15 @@ wake_to_waker_cluster(struct cpu_select_env *env)
task_load(env->p) < sched_small_wakee_task_load;
}
+static inline bool
+bias_to_waker_cpu(struct task_struct *p, int cpu)
+{
+ return sysctl_sched_prefer_sync_wakee_to_waker &&
+ cpu_rq(cpu)->nr_running == 1 &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
+ cpu_active(cpu) && !cpu_isolated(cpu);
+}
+
static inline int
cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
{
@@ -3080,6 +3090,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
struct cluster_cpu_stats stats;
struct related_thread_group *grp;
unsigned int sbc_flag = 0;
+ int cpu = raw_smp_processor_id();
struct cpu_select_env env = {
.p = p,
@@ -3111,14 +3122,20 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
else
env.rtg = grp;
} else {
- cluster = cpu_rq(smp_processor_id())->cluster;
- if (wake_to_waker_cluster(&env) &&
- cluster_allowed(p, cluster)) {
- env.need_waker_cluster = 1;
- bitmap_zero(env.candidate_list, NR_CPUS);
- __set_bit(cluster->id, env.candidate_list);
- env.sbc_best_cluster_flag = SBC_FLAG_WAKER_CLUSTER;
-
+ cluster = cpu_rq(cpu)->cluster;
+ if (wake_to_waker_cluster(&env)) {
+ if (bias_to_waker_cpu(p, cpu)) {
+ target = cpu;
+ sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+ SBC_FLAG_WAKER_CPU;
+ goto out;
+ } else if (cluster_allowed(p, cluster)) {
+ env.need_waker_cluster = 1;
+ bitmap_zero(env.candidate_list, NR_CPUS);
+ __set_bit(cluster->id, env.candidate_list);
+ env.sbc_best_cluster_flag =
+ SBC_FLAG_WAKER_CLUSTER;
+ }
} else if (bias_to_prev_cpu(&env, &stats)) {
sbc_flag = SBC_FLAG_PREV_CPU;
goto out;
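To make the control flow above easier to follow outside the kernel tree, here
is a stand-alone sketch (a simplification, not kernel code: the scheduler
internals are reduced to plain parameters, and pick_sync_wakee_cpu() is a
hypothetical name) of the decision select_best_cpu() now makes for a sync
wakeup:

  #include <stdbool.h>

  /* Mirrors sysctl_sched_prefer_sync_wakee_to_waker. */
  static bool prefer_sync_wakee_to_waker;

  /*
   * Shortcut to the waker CPU when the knob is set, the waker is the only
   * runnable task on its CPU, and the wakee may run there; otherwise fall
   * back to searching the waker's cluster, as in the hunk above.
   */
  static int pick_sync_wakee_cpu(int waker_cpu, unsigned int nr_running,
                                 bool wakee_allowed, bool active, bool isolated,
                                 int (*search_waker_cluster)(void))
  {
          if (prefer_sync_wakee_to_waker && nr_running == 1 &&
              wakee_allowed && active && !isolated)
                  return waker_cpu;            /* SBC_FLAG_WAKER_CPU */

          return search_waker_cluster();       /* SBC_FLAG_WAKER_CLUSTER */
  }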
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 3bbf3ee46cf9..61e352aeec00 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -897,6 +897,13 @@ unsigned int __read_mostly sched_spill_load;
unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
/*
+ * Prefer the waker CPU for a sync wakee task, if that CPU has only 1
+ * runnable task. This eliminates the LPM exit latency associated with
+ * the idle CPUs in the waker cluster.
+ */
+unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
+
+/*
* Tasks whose bandwidth consumption on a cpu is more than
* sched_upmigrate are considered "big" tasks. Big tasks will be
* considered for "up" migration, i.e migrating to a cpu with better
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dad3324e7372..cdce7d0f5a0e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -422,6 +422,15 @@ static struct ctl_table kern_table[] = {
.extra2 = &one_hundred,
},
{
+ .procname = "sched_prefer_sync_wakee_to_waker",
+ .data = &sysctl_sched_prefer_sync_wakee_to_waker,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "sched_enable_thread_grouping",
.data = &sysctl_sched_enable_thread_grouping,
.maxlen = sizeof(unsigned int),
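One closing note on the sysctl wiring: because the new entry uses
proc_dointvec_minmax with extra1 and extra2 pointing at zero and one, writes
outside [0, 1] are rejected. A hypothetical user-space check (again not part
of the patch) that confirms this:

  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/proc/sys/kernel/sched_prefer_sync_wakee_to_waker",
                        O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* proc_dointvec_minmax limits the knob to [0, 1]; "2" must fail. */
          if (write(fd, "2\n", 2) < 0)
                  printf("out-of-range write rejected: %s\n", strerror(errno));
          close(fd);
          return 0;
  }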