author    Srivatsa Vaddagiri <vatsa@codeaurora.org>    2015-04-20 12:35:48 +0530
committer David Keitel <dkeitel@codeaurora.org>        2016-03-23 21:25:10 -0700
commit    cb1bb6a8f4a881b2c6c80160a94975dd93175558 (patch)
tree      663ceabac7e860cfa7c15bae3dc7994c9c275da0 /kernel
parent    f36ae7405af944bb43f807c3e59bcddf47fd41e4 (diff)
sched: Introduce the concept of CPU clusters in the scheduler

A cluster is a set of CPUs sharing some power controls and an L2 cache.
This patch builds a list of clusters at bootup, sorted by their
max_power_cost. Many cluster-shared attributes such as cur_freq and
max_freq are currently maintained, needlessly, in the per-cpu
'struct rq'. Consolidate them in a cluster structure.

Change-Id: I0567672ad5fb67d211d9336181ceb53b9f6023af
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
[joonwoop@codeaurora.org: fixed minor conflict in
 arch/arm64/kernel/topology.c. fixed conflict due to omitted changes
 for CONFIG_SCHED_QHMP.]
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
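To make the consolidation concrete, here is a minimal, stand-alone C sketch (ordinary
userspace code, not part of the patch): cluster-wide attributes live in one shared
structure, and clusters sit on a list ordered by max_power_cost, mirroring
insert_cluster() and assign_cluster_ids() in the diff below. Field names follow
struct sched_cluster; the singly linked list and the numeric values are simplifications
for illustration only.

#include <stdio.h>

/* Cluster-wide attributes held once, instead of being duplicated per CPU. */
struct cluster {
	struct cluster *next;
	int id;
	int max_power_cost;
	unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
};

/* Keep the list sorted by ascending max_power_cost, as insert_cluster() does. */
static void insert_cluster(struct cluster **head, struct cluster *c)
{
	struct cluster **pos = head;

	while (*pos && (*pos)->max_power_cost <= c->max_power_cost)
		pos = &(*pos)->next;

	c->next = *pos;
	*pos = c;
}

int main(void)
{
	/* Hypothetical big.LITTLE pair; numbers are illustrative only. */
	static struct cluster little = { .max_power_cost = 10, .max_freq = 1400000 };
	static struct cluster big    = { .max_power_cost = 40, .max_freq = 2100000 };
	struct cluster *head = NULL, *c;
	int id = 0;

	/* Discovery order does not matter; the list ends up cheapest-first. */
	insert_cluster(&head, &big);
	insert_cluster(&head, &little);

	/* IDs are assigned by list position, like assign_cluster_ids(). */
	for (c = head; c; c = c->next) {
		c->id = id++;
		printf("cluster %d: max_power_cost=%d max_freq=%u\n",
		       c->id, c->max_power_cost, c->max_freq);
	}
	return 0;
}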
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c  | 732
-rw-r--r--  kernel/sched/debug.c |  14
-rw-r--r--  kernel/sched/fair.c  |  85
-rw-r--r--  kernel/sched/sched.h | 133
4 files changed, 580 insertions(+), 384 deletions(-)
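Before the diff itself, a rough sketch of the fixed-point (1024-based) scaling that the
patch moves from per-rq fields into the per-cluster compute_capacity() and
compute_load_scale_factor(). The efficiency and frequency figures below are made-up
examples, not values from any real SoC; only the arithmetic matches the patch.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* capacity: 1024 for the least efficient, lowest-max_freq cluster; larger for others. */
static int compute_capacity(unsigned int eff, unsigned int min_eff,
			    unsigned int max_freq, unsigned int min_max_freq)
{
	int capacity = 1024;

	capacity = capacity * ((1024 * eff) / min_eff) >> 10;
	capacity = capacity * ((1024 * max_freq) / min_max_freq) >> 10;
	return capacity;
}

/* load_scale_factor: 1024 for the "best" cluster; slower clusters scale task load up. */
static int compute_load_scale_factor(unsigned int eff, unsigned int max_eff,
				     unsigned int max_freq,
				     unsigned int max_possible_freq)
{
	int lsf = 1024;

	lsf = lsf * DIV_ROUND_UP(1024 * max_eff, eff) >> 10;
	lsf = lsf * DIV_ROUND_UP(1024 * max_possible_freq, max_freq) >> 10;
	return lsf;
}

int main(void)
{
	/* Hypothetical pair: efficiencies 1024/2048, max freqs 1.4/2.1 GHz (kHz). */
	printf("little: capacity=%d lsf=%d\n",
	       compute_capacity(1024, 1024, 1400000, 1400000),
	       compute_load_scale_factor(1024, 2048, 1400000, 2100000));
	printf("big:    capacity=%d lsf=%d\n",
	       compute_capacity(2048, 1024, 2100000, 1400000),
	       compute_load_scale_factor(2048, 2048, 2100000, 2100000));
	return 0;
}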
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0b35834bd46..8bd6fbde7efe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -76,6 +76,7 @@
#include <linux/compiler.h>
#include <linux/cpufreq.h>
#include <linux/syscore_ops.h>
+#include <linux/list_sort.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -800,15 +801,11 @@ sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
int wakeup_energy, int wakeup_latency)
{
- int cpu;
-
- for_each_cpu(cpu, cluster_cpus) {
- struct rq *rq = cpu_rq(cpu);
-
- rq->dstate = dstate;
- rq->dstate_wakeup_energy = wakeup_energy;
- rq->dstate_wakeup_latency = wakeup_latency;
- }
+ struct sched_cluster *cluster =
+ cpu_rq(cpumask_first(cluster_cpus))->cluster;
+ cluster->dstate = dstate;
+ cluster->dstate_wakeup_energy = wakeup_energy;
+ cluster->dstate_wakeup_latency = wakeup_latency;
}
#endif /* CONFIG_SMP */
@@ -1166,6 +1163,355 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SCHED_HMP
+unsigned int max_possible_efficiency = 1;
+unsigned int min_possible_efficiency = UINT_MAX;
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+ int i;
+ int max_cap = 0, min_cap = INT_MAX;
+
+ for_each_online_cpu(i) {
+ max_cap = max(max_cap, cpu_capacity(i));
+ min_cap = min(min_cap, cpu_capacity(i));
+ }
+
+ max_capacity = max_cap;
+ min_capacity = min_cap;
+}
+
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long
+capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return (1024 * cluster->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return (1024 * cluster->max_freq) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static inline unsigned long
+load_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cluster->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq, cluster->max_freq);
+}
+
+static int compute_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cluster);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_max_possible_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_load_scale_factor(struct sched_cluster *cluster)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cluster);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cluster);
+ load_scale >>= 10;
+
+ return load_scale;
+}
+
+static struct list_head cluster_head;
+static DEFINE_MUTEX(cluster_lock);
+static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
+DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
+struct sched_cluster *sched_cluster[NR_CPUS];
+int num_clusters;
+
+static struct sched_cluster init_cluster = {
+ .list = LIST_HEAD_INIT(init_cluster.list),
+ .id = 0,
+ .max_power_cost = 1,
+ .capacity = 1024,
+ .max_possible_capacity = 1024,
+ .efficiency = 1,
+ .load_scale_factor = 1024,
+ .cur_freq = 1,
+ .max_freq = 1,
+ .min_freq = 1,
+ .max_possible_freq = 1,
+ .dstate = 0,
+ .dstate_wakeup_energy = 0,
+ .dstate_wakeup_latency = 0,
+};
+
+#define for_each_sched_cluster(cluster) \
+ list_for_each_entry_rcu(cluster, &cluster_head, list)
+
+void update_all_clusters_stats(void)
+{
+ struct sched_cluster *cluster;
+ u64 highest_mpc = 0;
+
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_sched_cluster(cluster) {
+ u64 mpc;
+
+ cluster->capacity = compute_capacity(cluster);
+ mpc = cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+ if (mpc > highest_mpc)
+ highest_mpc = mpc;
+ }
+
+ max_possible_capacity = highest_mpc;
+
+ __update_min_max_capacity();
+ sched_update_freq_max_load(cpu_possible_mask);
+ post_big_task_count_change(cpu_possible_mask);
+}
+
+static void assign_cluster_ids(struct list_head *head)
+{
+ struct sched_cluster *cluster;
+ int pos = 0;
+
+ list_for_each_entry(cluster, head, list) {
+ cluster->id = pos;
+ sched_cluster[pos++] = cluster;
+ }
+}
+
+static void
+move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
+{
+ struct list_head *first, *last;
+
+ first = src->next;
+ last = src->prev;
+
+ if (sync_rcu) {
+ INIT_LIST_HEAD_RCU(src);
+ synchronize_rcu();
+ }
+
+ first->prev = dst;
+ dst->prev = last;
+ last->next = dst;
+
+ /* Ensure list sanity before making the head visible to all CPUs. */
+ smp_mb();
+ dst->next = first;
+}
+
+static int
+compare_clusters(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct sched_cluster *cluster1, *cluster2;
+ int ret;
+
+ cluster1 = container_of(a, struct sched_cluster, list);
+ cluster2 = container_of(b, struct sched_cluster, list);
+
+ ret = cluster1->max_power_cost > cluster2->max_power_cost ||
+ (cluster1->max_power_cost == cluster2->max_power_cost &&
+ cluster1->max_possible_capacity <
+ cluster2->max_possible_capacity);
+
+ return ret;
+}
+
+static void sort_clusters(void)
+{
+ struct sched_cluster *cluster;
+ struct list_head new_head;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_sched_cluster(cluster)
+ cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
+ max_task_load());
+
+ move_list(&new_head, &cluster_head, true);
+
+ list_sort(NULL, &new_head, compare_clusters);
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+}
+
+static void
+insert_cluster(struct sched_cluster *cluster, struct list_head *head)
+{
+ struct sched_cluster *tmp;
+ struct list_head *iter = head;
+
+ list_for_each_entry(tmp, head, list) {
+ if (cluster->max_power_cost < tmp->max_power_cost)
+ break;
+ iter = &tmp->list;
+ }
+
+ list_add(&cluster->list, iter);
+}
+
+static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
+{
+ struct sched_cluster *cluster = NULL;
+
+ cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
+ if (!cluster) {
+ __WARN_printf("Cluster allocation failed. \
+ Possible bad scheduling\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&cluster->list);
+ cluster->max_power_cost = 1;
+ cluster->capacity = 1024;
+ cluster->max_possible_capacity = 1024;
+ cluster->efficiency = 1;
+ cluster->load_scale_factor = 1024;
+ cluster->cur_freq = 1;
+ cluster->max_freq = 1;
+ cluster->min_freq = 1;
+ cluster->max_possible_freq = 1;
+ cluster->dstate = 0;
+ cluster->dstate_wakeup_energy = 0;
+ cluster->dstate_wakeup_latency = 0;
+ cluster->freq_init_done = false;
+
+ cluster->cpus = *cpus;
+ cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
+
+ if (cluster->efficiency > max_possible_efficiency)
+ max_possible_efficiency = cluster->efficiency;
+ if (cluster->efficiency < min_possible_efficiency)
+ min_possible_efficiency = cluster->efficiency;
+
+ return cluster;
+}
+
+static void add_cluster(const struct cpumask *cpus, struct list_head *head)
+{
+ struct sched_cluster *cluster = alloc_new_cluster(cpus);
+ int i;
+
+ if (!cluster)
+ return;
+
+ for_each_cpu(i, cpus)
+ cpu_rq(i)->cluster = cluster;
+
+ insert_cluster(cluster, head);
+ set_bit(num_clusters, all_cluster_ids);
+ num_clusters++;
+}
+
+static void update_cluster_topology(void)
+{
+ struct cpumask cpus = *cpu_possible_mask;
+ const struct cpumask *cluster_cpus;
+ struct list_head new_head;
+ int i;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_cpu(i, &cpus) {
+ cluster_cpus = cpu_coregroup_mask(i);
+ cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
+ cpumask_andnot(&cpus, &cpus, cluster_cpus);
+ add_cluster(cluster_cpus, &new_head);
+ }
+
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+}
+
+static void init_clusters(void)
+{
+ bitmap_clear(all_cluster_ids, 0, NR_CPUS);
+ init_cluster.cpus = *cpu_possible_mask;
+ INIT_LIST_HEAD(&cluster_head);
+}
static int __init set_sched_enable_hmp(char *str)
{
@@ -1251,15 +1597,15 @@ unsigned int sched_get_static_cpu_pwr_cost(int cpu)
int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
{
- struct rq *rq = cpu_rq(cpu);
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
- rq->static_cluster_pwr_cost = cost;
+ cluster->static_cluster_pwr_cost = cost;
return 0;
}
unsigned int sched_get_static_cluster_pwr_cost(int cpu)
{
- return cpu_rq(cpu)->static_cluster_pwr_cost;
+ return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
}
#else
@@ -1334,9 +1680,6 @@ static __read_mostly unsigned int sched_io_is_busy;
/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly sched_use_pelt;
-unsigned int max_possible_efficiency = 1024;
-unsigned int min_possible_efficiency = 1024;
-
/*
* Maximum possible frequency across all cpus. Task demand and cpu
* capacity (cpu_power) metrics are scaled in reference to it.
@@ -1353,12 +1696,8 @@ unsigned int min_max_freq = 1;
unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
-unsigned int max_load_scale_factor = 1024; /* max possible load scale factor */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
-/* Mask of all CPUs that have max_possible_capacity */
-cpumask_t mpc_mask = CPU_MASK_ALL;
-
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
@@ -1409,17 +1748,18 @@ update_window_start(struct rq *rq, u64 wallclock)
static inline u64 scale_exec_time(u64 delta, struct rq *rq)
{
- unsigned int cur_freq = rq->cur_freq;
+ int cpu = cpu_of(rq);
+ unsigned int cur_freq = cpu_cur_freq(cpu);
int sf;
if (unlikely(cur_freq > max_possible_freq))
- cur_freq = rq->max_possible_freq;
+ cur_freq = max_possible_freq;
/* round up div64 */
delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
max_possible_freq);
- sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+ sf = DIV_ROUND_UP(cpu_efficiency(cpu) * 1024, max_possible_efficiency);
delta *= sf;
delta >>= 10;
@@ -1460,7 +1800,7 @@ static inline unsigned int load_to_freq(struct rq *rq, u64 load)
load *= 128;
load = div64_u64(load, max_task_load());
- freq = load * rq->max_possible_freq;
+ freq = load * cpu_max_possible_freq(cpu_of(rq));
freq /= 128;
return freq;
@@ -1849,7 +2189,7 @@ int sched_update_freq_max_load(const cpumask_t *cpumask)
max_load->length = per_cpu_info[cpu].len;
max_demand = max_demand_capacity *
- cpu_rq(cpu)->max_possible_capacity;
+ cpu_max_possible_capacity(cpu);
i = 0;
costs = per_cpu_info[cpu].ptable;
@@ -1862,7 +2202,7 @@ int sched_update_freq_max_load(const cpumask_t *cpumask)
hfreq = div64_u64((u64)freq * hpct , 100);
entry->hdemand =
div64_u64(max_demand * hfreq,
- cpu_rq(cpu)->max_possible_freq);
+ cpu_max_possible_freq(cpu));
i++;
}
@@ -2151,35 +2491,6 @@ void sched_account_irqtime(int cpu, struct task_struct *curr,
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-unsigned long __weak arch_get_cpu_efficiency(int cpu)
-{
- return SCHED_LOAD_SCALE;
-}
-
-static void init_cpu_efficiency(void)
-{
- int i, efficiency;
- unsigned int max = 0, min = UINT_MAX;
-
- if (!sched_enable_hmp)
- return;
-
- for_each_possible_cpu(i) {
- efficiency = arch_get_cpu_efficiency(i);
- cpu_rq(i)->efficiency = efficiency;
-
- if (efficiency > max)
- max = efficiency;
- if (efficiency < min)
- min = efficiency;
- }
-
- BUG_ON(!max || !min);
-
- max_possible_efficiency = max;
- min_possible_efficiency = min;
-}
-
static void reset_task_stats(struct task_struct *p)
{
u32 sum = 0;
@@ -2448,10 +2759,10 @@ void sched_get_cpus_busy(struct sched_load *busy,
load[i] = rq->old_busy_time = rq->prev_runnable_sum;
nload[i] = rq->nt_prev_runnable_sum;
/*
- * Scale load in reference to rq->max_possible_freq.
+ * Scale load in reference to cluster max_possible_freq.
*
* Note that scale_load_to_cpu() scales load in reference to
- * rq->max_freq.
+ * the cluster max_freq.
*/
load[i] = scale_load_to_cpu(load[i], cpu);
nload[i] = scale_load_to_cpu(nload[i], cpu);
@@ -2459,8 +2770,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
notifier_sent[i] = rq->notifier_sent;
early_detection[i] = (rq->ed_task != NULL);
rq->notifier_sent = 0;
- cur_freq[i] = rq->cur_freq;
- max_freq[i] = rq->max_freq;
+ cur_freq[i] = cpu_cur_freq(cpu);
+ max_freq[i] = cpu_max_freq(cpu);
i++;
}
@@ -2490,14 +2801,14 @@ void sched_get_cpus_busy(struct sched_load *busy,
nload[i] = window_size;
load[i] = scale_load_to_freq(load[i], cur_freq[i],
- rq->max_possible_freq);
+ cpu_max_possible_freq(cpu));
nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
- rq->max_possible_freq);
+ cpu_max_possible_freq(cpu));
} else {
load[i] = scale_load_to_freq(load[i], max_freq[i],
- rq->max_possible_freq);
+ cpu_max_possible_freq(cpu));
nload[i] = scale_load_to_freq(nload[i], max_freq[i],
- rq->max_possible_freq);
+ cpu_max_possible_freq(cpu));
}
busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
@@ -2651,126 +2962,22 @@ heavy_task_wakeup(struct task_struct *p, struct rq *rq, int event)
#endif /* CONFIG_SCHED_FREQ_INPUT */
-/* Keep track of max/min capacity possible across CPUs "currently" */
-static void __update_min_max_capacity(void)
-{
- int i;
- int max = 0, min = INT_MAX;
-
- for_each_online_cpu(i) {
- if (cpu_rq(i)->capacity > max)
- max = cpu_rq(i)->capacity;
- if (cpu_rq(i)->capacity < min)
- min = cpu_rq(i)->capacity;
- }
-
- max_capacity = max;
- min_capacity = min;
-}
-
-static void update_min_max_capacity(void)
-{
- unsigned long flags;
- int i;
-
- local_irq_save(flags);
- for_each_possible_cpu(i)
- raw_spin_lock(&cpu_rq(i)->lock);
-
- __update_min_max_capacity();
-
- for_each_possible_cpu(i)
- raw_spin_unlock(&cpu_rq(i)->lock);
- local_irq_restore(flags);
-}
-
-/*
- * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
- * least efficient cpu gets capacity of 1024
- */
-unsigned long capacity_scale_cpu_efficiency(int cpu)
-{
- return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
-}
-
-/*
- * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
- * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
- */
-unsigned long capacity_scale_cpu_freq(int cpu)
-{
- return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
- * that "most" efficient cpu gets a load_scale_factor of 1
- */
-static inline unsigned long load_scale_cpu_efficiency(int cpu)
-{
- return DIV_ROUND_UP(1024 * max_possible_efficiency,
- cpu_rq(cpu)->efficiency);
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to cpu with best max_freq
- * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
- * of 1.
- */
-static inline unsigned long load_scale_cpu_freq(int cpu)
-{
- return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
-}
-
-static int compute_capacity(int cpu)
-{
- int capacity = 1024;
-
- capacity *= capacity_scale_cpu_efficiency(cpu);
- capacity >>= 10;
-
- capacity *= capacity_scale_cpu_freq(cpu);
- capacity >>= 10;
-
- return capacity;
-}
-
-static int compute_load_scale_factor(int cpu)
-{
- int load_scale = 1024;
-
- /*
- * load_scale_factor accounts for the fact that task load
- * is in reference to "best" performing cpu. Task's load will need to be
- * scaled (up) by a factor to determine suitability to be placed on a
- * (little) cpu.
- */
- load_scale *= load_scale_cpu_efficiency(cpu);
- load_scale >>= 10;
-
- load_scale *= load_scale_cpu_freq(cpu);
- load_scale >>= 10;
-
- return load_scale;
-}
-
#define sched_up_down_migrate_auto_update 1
static void check_for_up_down_migrate_update(const struct cpumask *cpus)
{
int i = cpumask_first(cpus);
- struct rq *rq = cpu_rq(i);
if (!sched_up_down_migrate_auto_update)
return;
- if (rq->max_possible_capacity == max_possible_capacity)
+ if (cpu_max_possible_capacity(i) == max_possible_capacity)
return;
- if (rq->max_possible_freq == rq->max_freq)
+ if (cpu_max_possible_freq(i) == cpu_max_freq(i))
up_down_migrate_scale_factor = 1024;
else
- up_down_migrate_scale_factor = (1024 * rq->max_possible_freq)/
- rq->max_freq;
+ up_down_migrate_scale_factor = (1024 *
+ cpu_max_possible_freq(i)) / cpu_max_freq(i);
update_up_down_migrate();
}
@@ -2779,13 +2986,10 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
- int i, update_max = 0;
- u64 highest_mpc = 0, highest_mplsf = 0;
- const struct cpumask *cpus = policy->related_cpus;
- unsigned int orig_min_max_freq = min_max_freq;
- unsigned int orig_max_possible_freq = max_possible_freq;
- /* Initialized to policy->max in case policy->related_cpus is empty! */
- unsigned int orig_max_freq = policy->max;
+ struct sched_cluster *cluster = NULL;
+ struct cpumask policy_cluster = *policy->related_cpus;
+ unsigned int orig_max_freq = 0;
+ int i, j, update_capacity = 0;
if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
val != CPUFREQ_CREATE_POLICY)
@@ -2796,16 +3000,6 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
return 0;
}
- for_each_cpu(i, policy->related_cpus) {
- cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
- policy->related_cpus);
- orig_max_freq = cpu_rq(i)->max_freq;
- cpu_rq(i)->min_freq = policy->min;
- cpu_rq(i)->max_freq = policy->max;
- cpu_rq(i)->cur_freq = policy->cur;
- cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
- }
-
max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
if (min_max_freq == 1)
min_max_freq = UINT_MAX;
@@ -2813,82 +3007,51 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
BUG_ON(!min_max_freq);
BUG_ON(!policy->max);
- /* Changes to policy other than max_freq don't require any updates */
- if (orig_max_freq == policy->max)
- return 0;
-
- /*
- * A changed min_max_freq or max_possible_freq (possible during bootup)
- * needs to trigger re-computation of load_scale_factor and capacity for
- * all possible cpus (even those offline). It also needs to trigger
- * re-computation of nr_big_task count on all online cpus.
- *
- * A changed rq->max_freq otoh needs to trigger re-computation of
- * load_scale_factor and capacity for just the cluster of cpus involved.
- * Since small task definition depends on max_load_scale_factor, a
- * changed load_scale_factor of one cluster could influence
- * classification of tasks in another cluster. Hence a changed
- * rq->max_freq will need to trigger re-computation of nr_big_task
- * count on all online cpus.
- *
- * While it should be sufficient for nr_big_tasks to be
- * re-computed for only online cpus, we have inadequate context
- * information here (in policy notifier) with regard to hotplug-safety
- * context in which notification is issued. As a result, we can't use
- * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
- * fixed up to issue notification always in hotplug-safe context,
- * re-compute nr_big_task for all possible cpus.
- */
+ for_each_cpu(i, &policy_cluster) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&policy_cluster, &policy_cluster,
+ &cluster->cpus);
+
+ orig_max_freq = cluster->max_freq;
+ cluster->min_freq = policy->min;
+ cluster->max_freq = policy->max;
+ cluster->cur_freq = policy->cur;
+
+ if (!cluster->freq_init_done) {
+ mutex_lock(&cluster_lock);
+ for_each_cpu(j, &cluster->cpus)
+ cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
+ policy->related_cpus);
+ cluster->max_possible_freq = policy->cpuinfo.max_freq;
+ cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->freq_init_done = true;
+
+ sort_clusters();
+ update_all_clusters_stats();
+ mutex_unlock(&cluster_lock);
+ continue;
+ }
- if (orig_min_max_freq != min_max_freq ||
- orig_max_possible_freq != max_possible_freq) {
- cpus = cpu_possible_mask;
- update_max = 1;
+ update_capacity += (orig_max_freq != policy->max);
}
- /*
- * Changed load_scale_factor can trigger reclassification of tasks as
- * big or small. Make this change "atomic" so that tasks are accounted
- * properly due to changed load_scale_factor
- */
- pre_big_task_count_change(cpu_possible_mask);
- for_each_cpu(i, cpus) {
- struct rq *rq = cpu_rq(i);
-
- rq->capacity = compute_capacity(i);
- rq->load_scale_factor = compute_load_scale_factor(i);
-
- if (update_max) {
- u64 mpc, mplsf;
-
- mpc = div_u64(((u64) rq->capacity) *
- rq->max_possible_freq, rq->max_freq);
- rq->max_possible_capacity = (int) mpc;
-
- mplsf = div_u64(((u64) rq->load_scale_factor) *
- rq->max_possible_freq, rq->max_freq);
-
- if (mpc > highest_mpc) {
- highest_mpc = mpc;
- cpumask_clear(&mpc_mask);
- cpumask_set_cpu(i, &mpc_mask);
- } else if (mpc == highest_mpc) {
- cpumask_set_cpu(i, &mpc_mask);
- }
-
- if (mplsf > highest_mplsf)
- highest_mplsf = mplsf;
- }
- }
+ if (!update_capacity)
+ return 0;
- if (update_max) {
- max_possible_capacity = highest_mpc;
- max_load_scale_factor = highest_mplsf;
+ policy_cluster = *policy->related_cpus;
+ pre_big_task_count_change(cpu_possible_mask);
- sched_update_freq_max_load(cpu_possible_mask);
+ for_each_cpu(i, &policy_cluster) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&policy_cluster, &policy_cluster,
+ &cluster->cpus);
+ cluster->capacity = compute_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
}
__update_min_max_capacity();
+
check_for_up_down_migrate_update(policy->related_cpus);
post_big_task_count_change(cpu_possible_mask);
@@ -2901,23 +3064,32 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
unsigned int cpu = freq->cpu, new_freq = freq->new;
unsigned long flags;
- int i;
+ struct sched_cluster *cluster;
+ struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
+ int i, j;
if (val != CPUFREQ_POSTCHANGE)
return 0;
BUG_ON(!new_freq);
- if (cpu_rq(cpu)->cur_freq == new_freq)
+ if (cpu_cur_freq(cpu) == new_freq)
return 0;
- for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
- struct rq *rq = cpu_rq(i);
- raw_spin_lock_irqsave(&rq->lock, flags);
- update_task_ravg(rq->curr, rq, TASK_UPDATE,
- sched_ktime_clock(), 0);
- rq->cur_freq = new_freq;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ for_each_cpu(i, &policy_cpus) {
+ cluster = cpu_rq(i)->cluster;
+
+ for_each_cpu(j, &cluster->cpus) {
+ struct rq *rq = cpu_rq(j);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ sched_ktime_clock(), 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ cluster->cur_freq = new_freq;
+ cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
}
return 0;
@@ -2931,6 +3103,10 @@ static int pwr_stats_ready_notifier(struct notifier_block *nb,
cpumask_set_cpu(cpu, &mask);
sched_update_freq_max_load(&mask);
+ mutex_lock(&cluster_lock);
+ sort_clusters();
+ mutex_unlock(&cluster_lock);
+
return 0;
}
@@ -3004,8 +3180,6 @@ update_task_ravg(struct task_struct *p, struct rq *rq,
{
}
-static inline void init_cpu_efficiency(void) {}
-
static inline void mark_task_starting(struct task_struct *p) {}
static inline void set_window_start(struct rq *rq) {}
@@ -9474,7 +9648,6 @@ void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
- init_cpu_efficiency();
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -9496,6 +9669,8 @@ void __init sched_init_smp(void)
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+ update_cluster_topology();
+
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -9543,6 +9718,10 @@ void __init sched_init(void)
BUG_ON(num_possible_cpus() > BITS_PER_LONG);
+#ifdef CONFIG_SCHED_HMP
+ init_clusters();
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
@@ -9660,15 +9839,7 @@ void __init sched_init(void)
rq->avg_idle = 2*sysctl_sched_migration_cost;
#ifdef CONFIG_SCHED_HMP
cpumask_set_cpu(i, &rq->freq_domain_cpumask);
- rq->cur_freq = 1;
- rq->max_freq = 1;
- rq->min_freq = 1;
- rq->max_possible_freq = 1;
- rq->max_possible_capacity = 0;
rq->hmp_stats.cumulative_runnable_avg = 0;
- rq->efficiency = 1024;
- rq->capacity = 1024;
- rq->load_scale_factor = 1024;
rq->window_start = 0;
rq->hmp_stats.nr_big_tasks = 0;
rq->hmp_flags = 0;
@@ -9676,8 +9847,13 @@ void __init sched_init(void)
rq->avg_irqload = 0;
rq->irqload_ts = 0;
rq->static_cpu_pwr_cost = 0;
- rq->static_cluster_pwr_cost = 0;
+ /*
+ * All cpus part of same cluster by default. This avoids the
+ * need to check for rq->cluster being non-NULL in hot-paths
+ * like select_best_cpu()
+ */
+ rq->cluster = &init_cluster;
#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
@@ -9689,10 +9865,6 @@ void __init sched_init(void)
rq->cstate = 0;
rq->wakeup_latency = 0;
- rq->dstate = 0;
- rq->dstate_wakeup_latency = 0;
- rq->dstate_wakeup_energy = 0;
-
INIT_LIST_HEAD(&rq->cfs_tasks);
rq_attach_root(rq, &def_root_domain);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b2bb6caa6a5b..469640a0a626 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -318,14 +318,14 @@ do { \
P(cpu_capacity);
#endif
#ifdef CONFIG_SCHED_HMP
- P(load_scale_factor);
- P(capacity);
- P(max_possible_capacity);
- P(efficiency);
- P(cur_freq);
- P(max_freq);
P(static_cpu_pwr_cost);
- P(static_cluster_pwr_cost);
+ P(cluster->static_cluster_pwr_cost);
+ P(cluster->load_scale_factor);
+ P(cluster->capacity);
+ P(cluster->max_possible_capacity);
+ P(cluster->efficiency);
+ P(cluster->cur_freq);
+ P(cluster->max_freq);
#endif
#ifdef CONFIG_SCHED_HMP
P(hmp_stats.nr_big_tasks);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2e69f2fb4447..be7b44f9a85f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2780,13 +2780,6 @@ static unsigned int __read_mostly
sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
-static inline int available_cpu_capacity(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- return rq->capacity;
-}
-
void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
@@ -2922,7 +2915,7 @@ static void boost_kick_cpus(void)
int i;
for_each_online_cpu(i) {
- if (cpu_rq(i)->capacity != max_capacity)
+ if (cpu_capacity(i) != max_capacity)
boost_kick(i);
}
}
@@ -2998,9 +2991,7 @@ done:
static int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->capacity == max_capacity)
+ if (cpu_capacity(cpu) == max_capacity)
return 1;
if (task_nice(p) > sched_upmigrate_min_nice || upmigrate_discouraged(p))
@@ -3068,12 +3059,12 @@ unsigned int power_cost(int cpu, u64 demand)
* capacity as a rough stand-in for real CPU power
* numbers, assuming bigger CPUs are more power
* hungry. */
- return rq->max_possible_capacity;
+ return cpu_max_possible_capacity(cpu);
rcu_read_lock();
max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
if (!max_load) {
- pc = rq->max_possible_capacity;
+ pc = cpu_max_possible_capacity(cpu);
goto unlock;
}
@@ -3108,8 +3099,9 @@ unlock:
if (idle_cpu(cpu) && rq->cstate) {
total_static_pwr_cost += rq->static_cpu_pwr_cost;
- if (rq->dstate)
- total_static_pwr_cost += rq->static_cluster_pwr_cost;
+ if (rq->cluster->dstate)
+ total_static_pwr_cost +=
+ rq->cluster->static_cluster_pwr_cost;
}
return pc + total_static_pwr_cost;
@@ -3120,7 +3112,7 @@ unlock:
#define DOWN_MIGRATION 2
#define IRQLOAD_MIGRATION 4
-static int skip_freq_domain(struct rq *task_rq, struct rq *rq, int reason)
+static int skip_cluster(int tcpu, int cpu, int reason)
{
int skip;
@@ -3129,11 +3121,11 @@ static int skip_freq_domain(struct rq *task_rq, struct rq *rq, int reason)
switch (reason) {
case UP_MIGRATION:
- skip = (rq->capacity <= task_rq->capacity);
+ skip = (cpu_capacity(cpu) <= cpu_capacity(tcpu));
break;
case DOWN_MIGRATION:
- skip = rq->capacity >= task_rq->capacity;
+ skip = (cpu_capacity(cpu) >= cpu_capacity(tcpu));
break;
case IRQLOAD_MIGRATION:
@@ -3247,13 +3239,13 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
power_cost(i, task_load(p) + cpu_cravg_sync(i, sync)),
cpu_temp(i));
- if (skip_freq_domain(trq, rq, reason)) {
+ if (skip_cluster(task_cpu(p), i, reason)) {
cpumask_andnot(&search_cpus, &search_cpus,
- &rq->freq_domain_cpumask);
+ &rq->cluster->cpus);
continue;
}
- if (skip_cpu(trq, rq, i, reason))
+ if (skip_cpu(task_rq(p), rq, i, reason))
continue;
cpu_load = cpu_load_sync(i, sync);
@@ -3263,8 +3255,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
if (spare_capacity > 0 &&
(spare_capacity > highest_spare_capacity ||
(spare_capacity == highest_spare_capacity &&
- cpu_rq(i)->capacity >
- cpu_rq(best_capacity_cpu)->capacity))) {
+ cpu_capacity(i) > cpu_capacity(best_capacity_cpu)))) {
highest_spare_capacity = spare_capacity;
best_capacity_cpu = i;
}
@@ -3434,7 +3425,7 @@ unsigned int nr_eligible_big_tasks(int cpu)
int nr_big = rq->hmp_stats.nr_big_tasks;
int nr = rq->nr_running;
- if (rq->max_possible_capacity != max_possible_capacity)
+ if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
return nr_big;
return nr;
@@ -3817,9 +3808,9 @@ static inline void reset_balance_interval(int cpu)
*
* Returns reason why task needs to be migrated
*/
-static inline int migration_needed(struct rq *rq, struct task_struct *p)
+static inline int migration_needed(struct task_struct *p, int cpu)
{
- int nice = task_nice(p);
+ int nice;
if (!sched_enable_hmp || p->state != TASK_RUNNING)
return 0;
@@ -3828,14 +3819,15 @@ static inline int migration_needed(struct rq *rq, struct task_struct *p)
if (task_will_be_throttled(p))
return 0;
- if (sched_cpu_high_irqload(cpu_of(rq)))
+ if (sched_cpu_high_irqload(cpu))
return IRQLOAD_MIGRATION;
+ nice = task_nice(p);
if ((nice > sched_upmigrate_min_nice || upmigrate_discouraged(p)) &&
- rq->capacity > min_capacity)
+ cpu_capacity(cpu) > min_capacity)
return DOWN_MIGRATION;
- if (!task_will_fit(p, cpu_of(rq)))
+ if (!task_will_fit(p, cpu))
return UP_MIGRATION;
return 0;
@@ -3873,7 +3865,7 @@ void check_for_migration(struct rq *rq, struct task_struct *p)
int cpu = cpu_of(rq), new_cpu;
int active_balance = 0, reason;
- reason = migration_needed(rq, p);
+ reason = migration_needed(p, cpu);
if (!reason)
return;
@@ -4413,22 +4405,18 @@ unsigned int pct_task_load(struct task_struct *p)
static inline void
add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta)
{
- struct rq *rq = cpu_rq(cpu);
- int cur_freq = rq->cur_freq, max_freq = rq->max_freq;
- int cpu_max_possible_freq = rq->max_possible_freq;
+ int cur_freq = cpu_cur_freq(cpu);
u64 scaled_delta;
int sf;
if (!sched_enable_hmp)
return;
- if (unlikely(cur_freq > max_possible_freq ||
- (cur_freq == max_freq &&
- max_freq < cpu_max_possible_freq)))
+ if (unlikely(cur_freq > max_possible_freq))
cur_freq = max_possible_freq;
scaled_delta = div64_u64(delta * cur_freq, max_possible_freq);
- sf = (rq->efficiency * 1024) / max_possible_efficiency;
+ sf = (cpu_efficiency(cpu) * 1024) / max_possible_efficiency;
scaled_delta *= sf;
scaled_delta >>= 10;
sa->runnable_avg_sum_scaled += scaled_delta;
@@ -7407,9 +7395,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (nr_big_tasks(env->src_rq) &&
- capacity(env->dst_rq) > capacity(env->src_rq) &&
- !is_big_task(p))
+ if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) &&
+ nr_big_tasks(env->src_rq) && !is_big_task(p))
return 0;
twf = task_will_fit(p, env->dst_cpu);
@@ -7528,7 +7515,7 @@ static int detach_tasks(struct lb_env *env)
if (env->imbalance <= 0)
return 0;
- if (capacity(env->dst_rq) < capacity(env->src_rq) &&
+ if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
!sched_boost())
env->flags |= LBF_IGNORE_BIG_TASKS;
@@ -7823,17 +7810,21 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static int
bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
+ int local_cpu, busiest_cpu;
int local_capacity, busiest_capacity;
unsigned int local_freq, busiest_freq, busiest_max_freq;
if (sched_boost())
return 0;
- local_capacity = group_rq_mpc(sds->local);
- busiest_capacity = group_rq_mpc(sds->busiest);
- local_freq = cpu_rq(group_first_cpu(sds->local))->cur_freq;
- busiest_freq = cpu_rq(group_first_cpu(sds->busiest))->cur_freq;
- busiest_max_freq = cpu_rq(group_first_cpu(sds->busiest))->max_freq;
+ local_cpu = group_first_cpu(sds->local);
+ busiest_cpu = group_first_cpu(sds->busiest);
+
+ local_capacity = cpu_max_possible_capacity(local_cpu);
+ busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+ local_freq = cpu_cur_freq(local_cpu);
+ busiest_freq = cpu_cur_freq(busiest_cpu);
+ busiest_max_freq = cpu_max_freq(busiest_cpu);
if (local_capacity < busiest_capacity) {
if (local_freq >= sysctl_sched_pack_freq &&
@@ -8174,7 +8165,7 @@ static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
struct sg_lb_stats *sgs)
{
if (env->idle != CPU_NOT_IDLE &&
- capacity(env->dst_rq) > group_rq_capacity(sg)) {
+ cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
if (sched_boost() && !sds->busiest && sgs->sum_nr_running) {
env->flags |= LBF_SCHED_BOOST_ACTIVE_BALANCE;
return true;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1e4b2e986b15..35a13974f34a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -369,6 +369,32 @@ struct hmp_sched_stats {
u64 cumulative_runnable_avg;
};
+struct sched_cluster {
+ struct list_head list;
+ struct cpumask cpus;
+ int id;
+ int max_power_cost;
+ int max_possible_capacity;
+ int capacity;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ /*
+ * max_freq = user or thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+ bool freq_init_done;
+ int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
+ unsigned int static_cluster_pwr_cost;
+};
+
+extern unsigned long all_cluster_ids[];
+
+static inline int cluster_first_cpu(struct sched_cluster *cluster)
+{
+ return cpumask_first(&cluster->cpus);
+}
+
#endif
/* CFS-related fields in a runqueue */
@@ -670,26 +696,16 @@ struct rq {
u64 idle_stamp;
u64 avg_idle;
int cstate, wakeup_latency, wakeup_energy;
- int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_SCHED_HMP
- /*
- * max_freq = user or thermal defined maximum
- * max_possible_freq = maximum supported by hardware
- */
- unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+ struct sched_cluster *cluster;
struct cpumask freq_domain_cpumask;
-
struct hmp_sched_stats hmp_stats;
- int efficiency; /* Differentiate cpus with different IPC capability */
- int load_scale_factor;
- int capacity;
- int max_possible_capacity;
u64 window_start;
unsigned long hmp_flags;
@@ -697,7 +713,6 @@ struct rq {
u64 avg_irqload;
u64 irqload_ts;
unsigned int static_cpu_pwr_cost;
- unsigned int static_cluster_pwr_cost;
struct task_struct *ed_task;
#ifdef CONFIG_SCHED_FREQ_INPUT
@@ -969,13 +984,6 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
-/*
- * Returns the rq capacity of any rq in a group. This does not play
- * well with groups where rq capacity can change independently.
- */
-#define group_rq_capacity(group) capacity(cpu_rq(group_first_cpu(group)))
-#define group_rq_mpc(group) max_poss_capacity(cpu_rq(group_first_cpu(group)))
-
#else
static inline void sched_ttwu_pending(void) { }
@@ -1009,9 +1017,6 @@ extern unsigned int max_capacity;
extern unsigned int min_capacity;
extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
-extern cpumask_t mpc_mask;
-extern unsigned long capacity_scale_cpu_efficiency(int cpu);
-extern unsigned long capacity_scale_cpu_freq(int cpu);
extern unsigned int sched_upmigrate;
extern unsigned int sched_downmigrate;
extern unsigned int sched_init_task_load_pelt;
@@ -1026,6 +1031,51 @@ unsigned int cpu_temp(int cpu);
extern unsigned int nr_eligible_big_tasks(int cpu);
extern void update_up_down_migrate(void);
+static inline int cpu_capacity(int cpu)
+{
+ return cpu_rq(cpu)->cluster->capacity;
+}
+
+static inline int cpu_max_possible_capacity(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_possible_capacity;
+}
+
+static inline int cpu_load_scale_factor(int cpu)
+{
+ return cpu_rq(cpu)->cluster->load_scale_factor;
+}
+
+static inline int cpu_efficiency(int cpu)
+{
+ return cpu_rq(cpu)->cluster->efficiency;
+}
+
+static inline unsigned int cpu_cur_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->cur_freq;
+}
+
+static inline unsigned int cpu_min_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->min_freq;
+}
+
+static inline unsigned int cpu_max_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_freq;
+}
+
+static inline unsigned int cpu_max_possible_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_possible_freq;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu)
+{
+ return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -1033,25 +1083,16 @@ extern void update_up_down_migrate(void);
*/
static inline u64 scale_load_to_cpu(u64 task_load, int cpu)
{
- struct rq *rq = cpu_rq(cpu);
+ u64 lsf = cpu_load_scale_factor(cpu);
- if (rq->load_scale_factor != 1024) {
- task_load *= (u64)rq->load_scale_factor;
+ if (lsf != 1024) {
+ task_load *= lsf;
task_load /= 1024;
}
return task_load;
}
-static inline int capacity(struct rq *rq)
-{
- return rq->capacity;
-}
-static inline int max_poss_capacity(struct rq *rq)
-{
- return rq->max_possible_capacity;
-}
-
static inline unsigned int task_load(struct task_struct *p)
{
if (sched_use_pelt)
@@ -1153,16 +1194,12 @@ static inline unsigned int nr_eligible_big_tasks(int cpu)
static inline int pct_task_load(struct task_struct *p) { return 0; }
-static inline int capacity(struct rq *rq)
-{
- return SCHED_LOAD_SCALE;
-}
-
-static inline int max_poss_capacity(struct rq *rq)
+static inline int cpu_capacity(int cpu)
{
return SCHED_LOAD_SCALE;
}
+static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
struct task_struct *p)
@@ -1174,16 +1211,6 @@ static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
{
}
-static inline unsigned long capacity_scale_cpu_efficiency(int cpu)
-{
- return SCHED_LOAD_SCALE;
-}
-
-static inline unsigned long capacity_scale_cpu_freq(int cpu)
-{
- return SCHED_LOAD_SCALE;
-}
-
static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock)
{
@@ -1193,6 +1220,12 @@ static inline int sched_cpu_high_irqload(int cpu) { return 0; }
#endif /* CONFIG_SCHED_HMP */
+/*
+ * Returns the rq capacity of any rq in a group. This does not play
+ * well with groups where rq capacity can change independently.
+ */
+#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
+
#ifdef CONFIG_SCHED_FREQ_INPUT
extern void check_for_freq_change(struct rq *rq);