author     Olav Haugan <ohaugan@codeaurora.org>   2016-11-01 17:30:36 -0700
committer  Olav Haugan <ohaugan@codeaurora.org>   2016-11-07 17:51:25 -0800
commit     411a978bce803f17a0aa757b9491969e0ca60b79 (patch)
tree       9f013a23982886239eab832de1200a3699fef27c /kernel/sched
parent     85d7e134cc5d95dfd3a1a5ee5a1d1435633288cd (diff)
sched: Fix race condition with active balance
There is a race condition between checking whether an active load balance
request has been set and clearing that request. A CPU might have an active
load balance request set and queued but not yet executed. Before the request
is executed, the request flag might be cleared by CPU isolation; the load
balancer or the tick might then try to do another active load balance. This
can cause the same active load balance work to be queued twice, triggering a
report of list corruption.

Fix this by moving the clearing of the request into the stopper thread and by
ensuring that load balance does not try to queue a request on an already
isolated CPU.

Change-Id: I5c900d2ee161fa692d66e3e66012398869715662
Signed-off-by: Olav Haugan <ohaugan@codeaurora.org>
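For illustration only, the guard the patch enforces can be pictured with a
small user-space sketch (plain C, not kernel code): the "active balance
pending" flag may only be set under the runqueue lock, and only while the
target CPU is not isolated, and it is cleared solely by the thread that ran
the queued work. All names below (fake_rq, try_request_active_balance(),
queue_active_balance_work(), the isolated flag) are illustrative stand-ins,
not kernel APIs.

/*
 * Minimal user-space sketch of the pattern; the pthread mutex stands in
 * for rq->lock and the bools for rq->active_balance / cpu_isolated().
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_rq {
	pthread_mutex_t lock;   /* stand-in for rq->lock */
	bool active_balance;    /* stand-in for rq->active_balance */
	bool isolated;          /* stand-in for cpu_isolated(cpu_of(rq)) */
};

/* Stand-in for queueing ->active_balance_work on the stopper thread. */
static void queue_active_balance_work(struct fake_rq *rq)
{
	(void)rq;
	printf("active balance work queued\n");
}

/* Caller path: request active balance only if none is pending and the
 * CPU has not been isolated, all while holding the runqueue lock. */
static bool try_request_active_balance(struct fake_rq *rq)
{
	bool queued = false;

	pthread_mutex_lock(&rq->lock);
	if (!rq->active_balance && !rq->isolated) {
		rq->active_balance = true;
		queue_active_balance_work(rq);
		queued = true;
	}
	pthread_mutex_unlock(&rq->lock);
	return queued;
}

/* Stopper-side path: only the thread that ran the work clears the flag,
 * mirroring the move of clear_hmp_request() into the stopper callback. */
static void active_balance_done(struct fake_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	rq->active_balance = false;
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	static struct fake_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER };

	try_request_active_balance(&rq);   /* queues work */
	try_request_active_balance(&rq);   /* refused: already pending */
	active_balance_done(&rq);
	rq.isolated = true;
	try_request_active_balance(&rq);   /* refused: CPU isolated */
	return 0;
}

Built with "gcc -pthread", the first request queues work, the second is
refused because a request is already pending, and the last is refused
because the CPU is isolated, which are exactly the double-queueing and
isolated-CPU cases the patch closes off.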
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c   14
-rw-r--r--   kernel/sched/fair.c   27
2 files changed, 31 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3b1688b3be7..f20c706ab24e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1912,7 +1912,7 @@ void scheduler_ipi(void)
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick())) {
+ if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
@@ -5570,7 +5570,6 @@ static void set_rq_offline(struct rq *rq);
int do_isolation_work_cpu_stop(void *data)
{
- unsigned long flags;
unsigned int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
@@ -5578,9 +5577,12 @@ int do_isolation_work_cpu_stop(void *data)
irq_migrate_all_off_this_cpu();
+ local_irq_disable();
+
sched_ttwu_pending();
+
/* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5588,13 +5590,16 @@ int do_isolation_work_cpu_stop(void *data)
}
migrate_tasks(rq, false);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
/*
* We might have been in tickless state. Clear NOHZ flags to avoid
* us being kicked for helping out with balancing
*/
nohz_balance_clear_nohz_mask(cpu);
+
+ clear_hmp_request(cpu);
+ local_irq_enable();
return 0;
}
@@ -5703,7 +5708,6 @@ int sched_isolate_cpu(int cpu)
migrate_sync_cpu(cpu, cpumask_first(&avail_cpus));
stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
- clear_hmp_request(cpu);
calc_load_migrate(rq);
update_max_interval();
sched_update_group_capacities(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 21a60beb8288..1674b1054f83 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8121,8 +8121,11 @@ static struct rq *find_busiest_queue_hmp(struct lb_env *env,
int max_nr_big = 0, nr_big;
bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
int i;
+ cpumask_t cpus;
- for_each_cpu(i, sched_group_cpus(group)) {
+ cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
+
+ for_each_cpu(i, &cpus) {
struct rq *rq = cpu_rq(i);
u64 cumulative_runnable_avg =
rq->hmp_stats.cumulative_runnable_avg;
@@ -8285,6 +8288,15 @@ static int need_active_balance(struct lb_env *env)
sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{
+ cpumask_t cpus;
+
+ cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+ cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+ return cpumask_first(&cpus);
+}
+
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
@@ -8302,7 +8314,8 @@ static int should_we_balance(struct lb_env *env)
sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+ cpu_isolated(cpu))
continue;
balance_cpu = cpu;
@@ -8310,7 +8323,7 @@ static int should_we_balance(struct lb_env *env)
}
if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
+ balance_cpu = group_balance_cpu_not_isolated(sg);
/*
* First idle cpu or the first cpu(busiest) in this sched group
@@ -8530,7 +8543,8 @@ no_move:
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
- if (!busiest->active_balance) {
+ if (!busiest->active_balance &&
+ !cpu_isolated(cpu_of(busiest))) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
@@ -9198,12 +9212,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ cpumask_t cpus;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+ for_each_cpu(balance_cpu, &cpus) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;