author    | Linux Build Service Account <lnxbuild@localhost>                  | 2016-08-26 22:22:38 -0700
committer | Gerrit - the friendly Code Review server <code-review@localhost>  | 2016-08-26 22:22:38 -0700
commit    | 1b7819036eaa2ba0c93bcc65fbbc1e72fa9b824a (patch)
tree      | a08a77e7e4a5c5bd332a7a98637516dd65b217d3 /kernel
parent    | 373c6ba99fddd8071aa8eebd889c73426760fa3c (diff)
parent    | 2552980f79e476b99d9f489c265a836dd61a2102 (diff)
Merge "sched: handle frequency alert notifications better"
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/Makefile |    1
-rw-r--r-- | kernel/sched/core.c   | 3394
-rw-r--r-- | kernel/sched/debug.c  |    5
-rw-r--r-- | kernel/sched/fair.c   | 1179
-rw-r--r-- | kernel/sched/hmp.c    | 3970
-rw-r--r-- | kernel/sched/sched.h  |  435
-rw-r--r-- | kernel/sysctl.c       |   20
7 files changed, 4409 insertions(+), 4595 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 846c15156616..1f159743ebfc 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SCHED_HMP) += hmp.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25afcb8a1402..a1626bdf8729 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,9 +74,6 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> -#include <linux/cpufreq.h> -#include <linux/syscore_ops.h> -#include <linux/list_sort.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -100,7 +97,6 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", "RQ_TO_RQ", "GROUP_TO_GROUP"}; -ATOMIC_NOTIFIER_HEAD(migration_notifier_head); ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); DEFINE_MUTEX(sched_domains_mutex); @@ -775,202 +771,8 @@ void sched_avg_update(struct rq *rq) } } -/* - * Note C-state for (idle) cpus. - * - * @cstate = cstate index, 0 -> active state - * @wakeup_energy = energy spent in waking up cpu - * @wakeup_latency = latency to wakeup from cstate - * - */ -void -sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) -{ - struct rq *rq = cpu_rq(cpu); - - rq->cstate = cstate; /* C1, C2 etc */ - rq->wakeup_energy = wakeup_energy; - rq->wakeup_latency = wakeup_latency; -} - #endif /* CONFIG_SMP */ -#ifdef CONFIG_SCHED_HMP - -static ktime_t ktime_last; -static bool sched_ktime_suspended; - -static bool use_cycle_counter; -static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; - -u64 sched_ktime_clock(void) -{ - if (unlikely(sched_ktime_suspended)) - return ktime_to_ns(ktime_last); - return ktime_get_ns(); -} - -static void sched_resume(void) -{ - sched_ktime_suspended = false; -} - -static int sched_suspend(void) -{ - ktime_last = ktime_get(); - sched_ktime_suspended = true; - return 0; -} - -static struct syscore_ops sched_syscore_ops = { - .resume = sched_resume, - .suspend = sched_suspend -}; - -static int __init sched_init_ops(void) -{ - register_syscore_ops(&sched_syscore_ops); - return 0; -} -late_initcall(sched_init_ops); - -static inline void clear_ed_task(struct task_struct *p, struct rq *rq) -{ - if (p == rq->ed_task) - rq->ed_task = NULL; -} - -static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) -{ - p->last_wake_ts = wallclock; -} - -static inline void set_task_last_switch_out(struct task_struct *p, - u64 wallclock) -{ - p->last_switch_out_ts = wallclock; -} - -/* - * Note D-state for (idle) cluster. 
- * - * @dstate = dstate index, 0 -> active state - * @wakeup_energy = energy spent in waking up cluster - * @wakeup_latency = latency to wakeup from cluster - * - */ -void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, - int wakeup_energy, int wakeup_latency) -{ - struct sched_cluster *cluster = - cpu_rq(cpumask_first(cluster_cpus))->cluster; - cluster->dstate = dstate; - cluster->dstate_wakeup_energy = wakeup_energy; - cluster->dstate_wakeup_latency = wakeup_latency; -} - -u32 __weak get_freq_max_load(int cpu, u32 freq) -{ - /* 100% by default */ - return 100; -} - -DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); -static DEFINE_SPINLOCK(freq_max_load_lock); - -int sched_update_freq_max_load(const cpumask_t *cpumask) -{ - int i, cpu, ret; - unsigned int freq; - struct cpu_pstate_pwr *costs; - struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); - struct freq_max_load *max_load, *old_max_load; - struct freq_max_load_entry *entry; - u64 max_demand_capacity, max_demand; - unsigned long flags; - u32 hfreq; - int hpct; - - if (!per_cpu_info) - return 0; - - spin_lock_irqsave(&freq_max_load_lock, flags); - max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); - for_each_cpu(cpu, cpumask) { - if (!per_cpu_info[cpu].ptable) { - ret = -EINVAL; - goto fail; - } - - old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); - - /* - * allocate len + 1 and leave the last power cost as 0 for - * power_cost() can stop iterating index when - * per_cpu_info[cpu].len > len of max_load due to race between - * cpu power stats update and get_cpu_pwr_stats(). - */ - max_load = kzalloc(sizeof(struct freq_max_load) + - sizeof(struct freq_max_load_entry) * - (per_cpu_info[cpu].len + 1), GFP_ATOMIC); - if (unlikely(!max_load)) { - ret = -ENOMEM; - goto fail; - } - - max_load->length = per_cpu_info[cpu].len; - - max_demand = max_demand_capacity * - cpu_max_possible_capacity(cpu); - - i = 0; - costs = per_cpu_info[cpu].ptable; - while (costs[i].freq) { - entry = &max_load->freqs[i]; - freq = costs[i].freq; - hpct = get_freq_max_load(cpu, freq); - if (hpct <= 0 && hpct > 100) - hpct = 100; - hfreq = div64_u64((u64)freq * hpct, 100); - entry->hdemand = - div64_u64(max_demand * hfreq, - cpu_max_possible_freq(cpu)); - i++; - } - - rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); - if (old_max_load) - kfree_rcu(old_max_load, rcu); - } - - spin_unlock_irqrestore(&freq_max_load_lock, flags); - return 0; - -fail: - for_each_cpu(cpu, cpumask) { - max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); - if (max_load) { - rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); - kfree_rcu(max_load, rcu); - } - } - - spin_unlock_irqrestore(&freq_max_load_lock, flags); - return ret; -} - -#else /* CONFIG_SCHED_HMP */ -u64 sched_ktime_clock(void) -{ - return 0; -} - -static inline void clear_ed_task(struct task_struct *p, struct rq *rq) {} -static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) {} -static inline void set_task_last_switch_out(struct task_struct *p, - u64 wallclock) {} -#endif /* CONFIG_SCHED_HMP */ - #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) /* @@ -1259,3004 +1061,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq, true); } -#ifdef CONFIG_SCHED_HMP -unsigned int max_possible_efficiency = 1; -unsigned int min_possible_efficiency = UINT_MAX; - -unsigned long __weak 
arch_get_cpu_efficiency(int cpu) -{ - return SCHED_LOAD_SCALE; -} - -/* Keep track of max/min capacity possible across CPUs "currently" */ -static void __update_min_max_capacity(void) -{ - int i; - int max_cap = 0, min_cap = INT_MAX; - - for_each_online_cpu(i) { - max_cap = max(max_cap, cpu_capacity(i)); - min_cap = min(min_cap, cpu_capacity(i)); - } - - max_capacity = max_cap; - min_capacity = min_cap; -} - -static void update_min_max_capacity(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - __update_min_max_capacity(); - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - local_irq_restore(flags); -} - -/* - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that - * least efficient cpu gets capacity of 1024 - */ -static unsigned long -capacity_scale_cpu_efficiency(struct sched_cluster *cluster) -{ - return (1024 * cluster->efficiency) / min_possible_efficiency; -} - -/* - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. - */ -static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) -{ - return (1024 * cluster_max_freq(cluster)) / min_max_freq; -} - -/* - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so - * that "most" efficient cpu gets a load_scale_factor of 1 - */ -static inline unsigned long -load_scale_cpu_efficiency(struct sched_cluster *cluster) -{ - return DIV_ROUND_UP(1024 * max_possible_efficiency, - cluster->efficiency); -} - -/* - * Return load_scale_factor of a cpu in reference to cpu with best max_freq - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor - * of 1. - */ -static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) -{ - return DIV_ROUND_UP(1024 * max_possible_freq, - cluster_max_freq(cluster)); -} - -static int compute_capacity(struct sched_cluster *cluster) -{ - int capacity = 1024; - - capacity *= capacity_scale_cpu_efficiency(cluster); - capacity >>= 10; - - capacity *= capacity_scale_cpu_freq(cluster); - capacity >>= 10; - - return capacity; -} - -static int compute_max_possible_capacity(struct sched_cluster *cluster) -{ - int capacity = 1024; - - capacity *= capacity_scale_cpu_efficiency(cluster); - capacity >>= 10; - - capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; - capacity >>= 10; - - return capacity; -} - -static int compute_load_scale_factor(struct sched_cluster *cluster) -{ - int load_scale = 1024; - - /* - * load_scale_factor accounts for the fact that task load - * is in reference to "best" performing cpu. Task's load will need to be - * scaled (up) by a factor to determine suitability to be placed on a - * (little) cpu. 
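
Not from this patch: a standalone user-space sketch of the compute_capacity()/compute_load_scale_factor() arithmetic removed above, with made-up efficiency and frequency numbers. In the kernel the reference values (min_possible_efficiency, min_max_freq, max_possible_freq) are derived across all clusters; here they are fixed constants for illustration.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* scale x against a reference so that the reference maps to 1024 */
static unsigned long scale_up(unsigned long x, unsigned long ref)
{
	return (1024UL * x) / ref;
}

int main(void)
{
	/* made-up numbers: the little cluster is the least efficient and slowest */
	unsigned long min_eff = 1024, max_eff = 1638;		/* efficiencies */
	unsigned long min_max_freq = 1500000;			/* kHz */
	unsigned long max_possible_freq = 2100000;		/* kHz */
	unsigned long big_eff = 1638, big_max_freq = 2100000;

	/* compute_capacity(): 1024 scaled up by relative efficiency, then by max_freq */
	unsigned long capacity = 1024;
	capacity = (capacity * scale_up(big_eff, min_eff)) >> 10;
	capacity = (capacity * scale_up(big_max_freq, min_max_freq)) >> 10;

	/* compute_load_scale_factor(): the inverse, relative to the best cpu */
	unsigned long lsf = 1024;
	lsf = (lsf * DIV_ROUND_UP(1024UL * max_eff, big_eff)) >> 10;
	lsf = (lsf * DIV_ROUND_UP(1024UL * max_possible_freq, big_max_freq)) >> 10;

	printf("big cluster: capacity=%lu load_scale_factor=%lu\n", capacity, lsf);
	return 0;
}
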
- */ - load_scale *= load_scale_cpu_efficiency(cluster); - load_scale >>= 10; - - load_scale *= load_scale_cpu_freq(cluster); - load_scale >>= 10; - - return load_scale; -} - -struct list_head cluster_head; -static DEFINE_MUTEX(cluster_lock); -static cpumask_t all_cluster_cpus = CPU_MASK_NONE; -DECLARE_BITMAP(all_cluster_ids, NR_CPUS); -struct sched_cluster *sched_cluster[NR_CPUS]; -int num_clusters; - -static struct sched_cluster init_cluster = { - .list = LIST_HEAD_INIT(init_cluster.list), - .id = 0, - .max_power_cost = 1, - .min_power_cost = 1, - .capacity = 1024, - .max_possible_capacity = 1024, - .efficiency = 1, - .load_scale_factor = 1024, - .cur_freq = 1, - .max_freq = 1, - .max_mitigated_freq = UINT_MAX, - .min_freq = 1, - .max_possible_freq = 1, - .dstate = 0, - .dstate_wakeup_energy = 0, - .dstate_wakeup_latency = 0, - .exec_scale_factor = 1024, -}; - -void update_all_clusters_stats(void) -{ - struct sched_cluster *cluster; - u64 highest_mpc = 0, lowest_mpc = U64_MAX; - - pre_big_task_count_change(cpu_possible_mask); - - for_each_sched_cluster(cluster) { - u64 mpc; - - cluster->capacity = compute_capacity(cluster); - mpc = cluster->max_possible_capacity = - compute_max_possible_capacity(cluster); - cluster->load_scale_factor = compute_load_scale_factor(cluster); - - cluster->exec_scale_factor = - DIV_ROUND_UP(cluster->efficiency * 1024, - max_possible_efficiency); - - if (mpc > highest_mpc) - highest_mpc = mpc; - - if (mpc < lowest_mpc) - lowest_mpc = mpc; - } - - max_possible_capacity = highest_mpc; - min_max_possible_capacity = lowest_mpc; - - __update_min_max_capacity(); - sched_update_freq_max_load(cpu_possible_mask); - post_big_task_count_change(cpu_possible_mask); -} - -static void assign_cluster_ids(struct list_head *head) -{ - struct sched_cluster *cluster; - int pos = 0; - - list_for_each_entry(cluster, head, list) { - cluster->id = pos; - sched_cluster[pos++] = cluster; - } -} - -static void -move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) -{ - struct list_head *first, *last; - - first = src->next; - last = src->prev; - - if (sync_rcu) { - INIT_LIST_HEAD_RCU(src); - synchronize_rcu(); - } - - first->prev = dst; - dst->prev = last; - last->next = dst; - - /* Ensure list sanity before making the head visible to all CPUs. */ - smp_mb(); - dst->next = first; -} - -static int -compare_clusters(void *priv, struct list_head *a, struct list_head *b) -{ - struct sched_cluster *cluster1, *cluster2; - int ret; - - cluster1 = container_of(a, struct sched_cluster, list); - cluster2 = container_of(b, struct sched_cluster, list); - - ret = cluster1->max_power_cost > cluster2->max_power_cost || - (cluster1->max_power_cost == cluster2->max_power_cost && - cluster1->max_possible_capacity < - cluster2->max_possible_capacity); - - return ret; -} - -static void sort_clusters(void) -{ - struct sched_cluster *cluster; - struct list_head new_head; - - INIT_LIST_HEAD(&new_head); - - for_each_sched_cluster(cluster) { - cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), - max_task_load()); - cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), - 0); - } - - move_list(&new_head, &cluster_head, true); - - list_sort(NULL, &new_head, compare_clusters); - assign_cluster_ids(&new_head); - - /* - * Ensure cluster ids are visible to all CPUs before making - * cluster_head visible. 
- */ - move_list(&cluster_head, &new_head, false); -} - -static void -insert_cluster(struct sched_cluster *cluster, struct list_head *head) -{ - struct sched_cluster *tmp; - struct list_head *iter = head; - - list_for_each_entry(tmp, head, list) { - if (cluster->max_power_cost < tmp->max_power_cost) - break; - iter = &tmp->list; - } - - list_add(&cluster->list, iter); -} - -static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) -{ - struct sched_cluster *cluster = NULL; - - cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); - if (!cluster) { - __WARN_printf("Cluster allocation failed. \ - Possible bad scheduling\n"); - return NULL; - } - - INIT_LIST_HEAD(&cluster->list); - cluster->max_power_cost = 1; - cluster->min_power_cost = 1; - cluster->capacity = 1024; - cluster->max_possible_capacity = 1024; - cluster->efficiency = 1; - cluster->load_scale_factor = 1024; - cluster->cur_freq = 1; - cluster->max_freq = 1; - cluster->max_mitigated_freq = UINT_MAX; - cluster->min_freq = 1; - cluster->max_possible_freq = 1; - cluster->dstate = 0; - cluster->dstate_wakeup_energy = 0; - cluster->dstate_wakeup_latency = 0; - cluster->freq_init_done = false; - - cluster->cpus = *cpus; - cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); - - if (cluster->efficiency > max_possible_efficiency) - max_possible_efficiency = cluster->efficiency; - if (cluster->efficiency < min_possible_efficiency) - min_possible_efficiency = cluster->efficiency; - - return cluster; -} - -static void add_cluster(const struct cpumask *cpus, struct list_head *head) -{ - struct sched_cluster *cluster = alloc_new_cluster(cpus); - int i; - - if (!cluster) - return; - - for_each_cpu(i, cpus) - cpu_rq(i)->cluster = cluster; - - insert_cluster(cluster, head); - set_bit(num_clusters, all_cluster_ids); - num_clusters++; -} - -#ifdef CONFIG_SMP -static void update_cluster_topology(void) -{ - struct cpumask cpus = *cpu_possible_mask; - const struct cpumask *cluster_cpus; - struct list_head new_head; - int i; - - INIT_LIST_HEAD(&new_head); - - for_each_cpu(i, &cpus) { - cluster_cpus = cpu_coregroup_mask(i); - cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); - cpumask_andnot(&cpus, &cpus, cluster_cpus); - add_cluster(cluster_cpus, &new_head); - } - - assign_cluster_ids(&new_head); - - /* - * Ensure cluster ids are visible to all CPUs before making - * cluster_head visible. 
- */ - move_list(&cluster_head, &new_head, false); -} -#endif - -static void init_clusters(void) -{ - bitmap_clear(all_cluster_ids, 0, NR_CPUS); - init_cluster.cpus = *cpu_possible_mask; - INIT_LIST_HEAD(&cluster_head); -} - -int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) -{ - mutex_lock(&cluster_lock); - if (!cb->get_cpu_cycle_counter) { - mutex_unlock(&cluster_lock); - return -EINVAL; - } - - cpu_cycle_counter_cb = *cb; - use_cycle_counter = true; - mutex_unlock(&cluster_lock); - - return 0; -} - -static int __init set_sched_enable_hmp(char *str) -{ - int enable_hmp = 0; - - get_option(&str, &enable_hmp); - - sched_enable_hmp = !!enable_hmp; - - return 0; -} - -early_param("sched_enable_hmp", set_sched_enable_hmp); - -static inline int got_boost_kick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - return test_bit(BOOST_KICK, &rq->hmp_flags); -} - -static inline void clear_boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - clear_bit(BOOST_KICK, &rq->hmp_flags); -} - -void boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) - smp_send_reschedule(cpu); -} - -/* Clear any HMP scheduler related requests pending from or on cpu */ -static inline void clear_hmp_request(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - clear_boost_kick(cpu); - clear_reserved(cpu); - if (rq->push_task) { - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->push_task) { - clear_reserved(rq->push_cpu); - put_task_struct(rq->push_task); - rq->push_task = NULL; - } - rq->active_balance = 0; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} - -int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) -{ - struct rq *rq = cpu_rq(cpu); - - rq->static_cpu_pwr_cost = cost; - return 0; -} - -unsigned int sched_get_static_cpu_pwr_cost(int cpu) -{ - return cpu_rq(cpu)->static_cpu_pwr_cost; -} - -int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) -{ - struct sched_cluster *cluster = cpu_rq(cpu)->cluster; - - cluster->static_cluster_pwr_cost = cost; - return 0; -} - -unsigned int sched_get_static_cluster_pwr_cost(int cpu) -{ - return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; -} - -#else /* CONFIG_SCHED_HMP */ - -static inline int got_boost_kick(void) -{ - return 0; -} - -static inline void clear_boost_kick(int cpu) { } - -static inline void clear_hmp_request(int cpu) { } - -int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) -{ - return 0; -} - -#ifdef CONFIG_SMP -static void update_cluster_topology(void) { } -#endif - -#endif /* CONFIG_SCHED_HMP */ - -#define SCHED_MIN_FREQ 1 - -#if defined(CONFIG_SCHED_HMP) - -/* - * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy - * associated with them. This is required for atomic update of those variables - * when being modifed via sysctl interface. - * - * IMPORTANT: Initialize both copies to same value!! - */ - -/* - * Tasks that are runnable continuously for a period greather than - * EARLY_DETECTION_DURATION can be flagged early as potential - * high load tasks. 
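
Not from this patch: a user-space analogue of the boost_kick() coalescing shown above. The kernel sets the BOOST_KICK bit with test_and_set_bit() and sends a reschedule IPI only when the bit was previously clear, so repeated boost requests collapse into a single kick; the sketch uses a C11 atomic_flag and a printf stand-in for the IPI.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag boost_pending = ATOMIC_FLAG_INIT;

/* stand-in for smp_send_reschedule(cpu) */
static void send_kick(int cpu)
{
	printf("kick cpu %d\n", cpu);
}

static void boost_kick(int cpu)
{
	/* only the first caller since the last clear actually sends a kick */
	if (!atomic_flag_test_and_set(&boost_pending))
		send_kick(cpu);
}

static void clear_boost_kick(void)
{
	atomic_flag_clear(&boost_pending);
}

int main(void)
{
	boost_kick(2);		/* sends the kick */
	boost_kick(2);		/* coalesced, nothing sent */
	clear_boost_kick();	/* target cpu acknowledges */
	boost_kick(2);		/* sends again */
	return 0;
}
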
- */ -#define EARLY_DETECTION_DURATION 9500000 - -static __read_mostly unsigned int sched_ravg_hist_size = 5; -__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; - -static __read_mostly unsigned int sched_window_stats_policy = - WINDOW_STATS_MAX_RECENT_AVG; -__read_mostly unsigned int sysctl_sched_window_stats_policy = - WINDOW_STATS_MAX_RECENT_AVG; - -#define SCHED_ACCOUNT_WAIT_TIME 1 - -__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); - -unsigned int __read_mostly sysctl_sched_enable_colocation = 1; - -#ifdef CONFIG_SCHED_FREQ_INPUT - -__read_mostly unsigned int sysctl_sched_new_task_windows = 5; - -#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 - -/* - * For increase, send notification if - * freq_required - cur_freq > sysctl_sched_freq_inc_notify - */ -__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ - -/* - * For decrease, send notification if - * cur_freq - freq_required > sysctl_sched_freq_dec_notify - */ -__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ - -static __read_mostly unsigned int sched_io_is_busy; - -__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ -unsigned int __read_mostly sched_use_pelt; - -/* - * Maximum possible frequency across all cpus. Task demand and cpu - * capacity (cpu_power) metrics are scaled in reference to it. - */ -unsigned int max_possible_freq = 1; - -/* - * Minimum possible max_freq across all cpus. This will be same as - * max_possible_freq on homogeneous systems and could be different from - * max_possible_freq on heterogenous systems. min_max_freq is used to derive - * capacity (cpu_power) of cpus. - */ -unsigned int min_max_freq = 1; - -unsigned int max_capacity = 1024; /* max(rq->capacity) */ -unsigned int min_capacity = 1024; /* min(rq->capacity) */ -unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ -unsigned int -min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ - -/* Window size (in ns) */ -__read_mostly unsigned int sched_ravg_window = 10000000; - -/* Min window size (in ns) = 10ms */ -#define MIN_SCHED_RAVG_WINDOW 10000000 - -/* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 - -/* Temporarily disable window-stats activity on all cpus */ -unsigned int __read_mostly sched_disable_window_stats; - -/* - * Major task runtime. If a task runs for more than sched_major_task_runtime - * in a window, it's considered to be generating majority of workload - * for this window. Prediction could be adjusted for such tasks. - */ -#ifdef CONFIG_SCHED_FREQ_INPUT -__read_mostly unsigned int sched_major_task_runtime = 10000000; -#endif - -static unsigned int sync_cpu; - -static LIST_HEAD(related_thread_groups); -static DEFINE_RWLOCK(related_thread_group_lock); - -#define for_each_related_thread_group(grp) \ - list_for_each_entry(grp, &related_thread_groups, list) - -/* - * Demand aggregation for frequency purpose: - * - * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads - * for frequency determination purpose. This aggregation is done per-cluster. - * - * CPU demand of tasks from various related groups is aggregated per-cluster and - * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined - * by just rq->prev_runnable_sum. 
- * - * Some examples follow, which assume: - * Cluster0 = CPU0-3, Cluster1 = CPU4-7 - * One related thread group A that has tasks A0, A1, A2 - * - * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of - * tasks belonging to group A are accumulated when they run on cpu X. - * - * CX->curr/prev_sum = counters in which cpu execution stats of all tasks - * not belonging to group A are accumulated when they run on cpu X - * - * Lets say the stats for window M was as below: - * - * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms - * Task A0 ran 5ms on CPU0 - * Task B0 ran 1ms on CPU0 - * - * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms - * Task A1 ran 4ms on CPU1 - * Task A2 ran 2ms on CPU1 - * Task B1 ran 5ms on CPU1 - * - * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 - * CPU2 idle - * - * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 - * CPU3 idle - * - * In this case, CPU1 was most busy going by just its prev_sum counter. Demand - * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy - * time reported to governor will be: - * - * - * C0 busy time = 1ms - * C1 busy time = 5 + 5 + 6 = 16ms - * - */ -static __read_mostly unsigned int sched_freq_aggregate; -__read_mostly unsigned int sysctl_sched_freq_aggregate; - -#define EXITING_TASK_MARKER 0xdeaddead - -static inline int exiting_task(struct task_struct *p) -{ - return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); -} - -static int __init set_sched_ravg_window(char *str) -{ - get_option(&str, &sched_ravg_window); - - sched_use_pelt = (sched_ravg_window < MIN_SCHED_RAVG_WINDOW || - sched_ravg_window > MAX_SCHED_RAVG_WINDOW); - - return 0; -} - -early_param("sched_ravg_window", set_sched_ravg_window); - -static inline void -update_window_start(struct rq *rq, u64 wallclock) -{ - s64 delta; - int nr_windows; - - delta = wallclock - rq->window_start; - BUG_ON(delta < 0); - if (delta < sched_ravg_window) - return; - - nr_windows = div64_u64(delta, sched_ravg_window); - rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; -} - -#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) - -static inline u64 scale_exec_time(u64 delta, struct rq *rq) -{ - u32 freq; - - freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); - delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); - delta *= rq->cluster->exec_scale_factor; - delta >>= 10; - - return delta; -} - -#ifdef CONFIG_SCHED_FREQ_INPUT - -static inline int cpu_is_waiting_on_io(struct rq *rq) -{ - if (!sched_io_is_busy) - return 0; - - return atomic_read(&rq->nr_iowait); -} - -/* Does freq_required sufficiently exceed or fall behind cur_freq? */ -static inline int -nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) -{ - int delta = freq_required - cur_freq; - - if (freq_required > cur_freq) - return delta < sysctl_sched_freq_inc_notify; - - delta = -delta; - - return delta < sysctl_sched_freq_dec_notify; -} - -/* Convert busy time to frequency equivalent */ -static inline unsigned int load_to_freq(struct rq *rq, u64 load) -{ - unsigned int freq; - - load = scale_load_to_cpu(load, cpu_of(rq)); - load *= 128; - load = div64_u64(load, max_task_load()); - - freq = load * cpu_max_possible_freq(cpu_of(rq)); - freq /= 128; - - return freq; -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu); - -/* - * Return load from all related group in given cpu. - * Caller must ensure that related_thread_group_lock is held. 
- */ -static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) -{ - struct related_thread_group *grp; - - for_each_related_thread_group(grp) { - struct group_cpu_time *cpu_time; - - cpu_time = _group_cpu_time(grp, cpu); - *grp_load += cpu_time->prev_runnable_sum; - if (new_grp_load) - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } -} - -/* - * Return load from all related groups in given frequency domain. - * Caller must ensure that related_thread_group_lock is held. - */ -static void group_load_in_freq_domain(struct cpumask *cpus, - u64 *grp_load, u64 *new_grp_load) -{ - struct related_thread_group *grp; - int j; - - for_each_related_thread_group(grp) { - for_each_cpu(j, cpus) { - struct group_cpu_time *cpu_time; - - cpu_time = _group_cpu_time(grp, j); - *grp_load += cpu_time->prev_runnable_sum; - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } - } -} - -/* - * Should scheduler alert governor for changing frequency? - * - * @check_pred - evaluate frequency based on the predictive demand - * @check_groups - add load from all related groups on given cpu - * - * check_groups is set to 1 if a "related" task movement/wakeup is triggering - * the notification check. To avoid "re-aggregation" of demand in such cases, - * we check whether the migrated/woken tasks demand (along with demand from - * existing tasks on the cpu) can be met on target cpu - * - */ - -static int send_notification(struct rq *rq, int check_pred, int check_groups) -{ - unsigned int cur_freq, freq_required; - unsigned long flags; - int rc = 0; - u64 group_load = 0, new_load = 0; - - if (!sched_enable_hmp) - return 0; - - if (check_pred) { - u64 prev = rq->old_busy_time; - u64 predicted = rq->hmp_stats.pred_demands_sum; - - if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) - return 0; - - prev = max(prev, rq->old_estimated_time); - if (prev > predicted) - return 0; - - cur_freq = load_to_freq(rq, prev); - freq_required = load_to_freq(rq, predicted); - - if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) - return 0; - } else { - read_lock(&related_thread_group_lock); - /* - * Protect from concurrent update of rq->prev_runnable_sum and - * group cpu load - */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (check_groups) - _group_load_in_cpu(cpu_of(rq), &group_load, NULL); - - new_load = rq->prev_runnable_sum + group_load; - - raw_spin_unlock_irqrestore(&rq->lock, flags); - read_unlock(&related_thread_group_lock); - - cur_freq = load_to_freq(rq, rq->old_busy_time); - freq_required = load_to_freq(rq, new_load); - - if (nearly_same_freq(cur_freq, freq_required)) - return 0; - } - - raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq->notifier_sent) { - rq->notifier_sent = 1; - rc = 1; - trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, - new_load); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return rc; -} - -/* Alert governor if there is a need to change frequency */ -void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) -{ - int cpu = cpu_of(rq); - - if (!send_notification(rq, check_pred, check_groups)) - return; - - atomic_notifier_call_chain( - &load_alert_notifier_head, 0, - (void *)(long)cpu); -} - -static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, - u64 irqtime, int event) -{ - if (is_idle_task(p)) { - /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! 
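
Not from this patch: a trimmed user-space mirror of the nearly_same_freq() check used by send_notification() above. The thresholds are in kHz and default to 10 * 1024 * 1024 (roughly 10 GHz, per the removed comment), so the delta check is effectively never satisfied until the sysctl is lowered; the 300 MHz value below is purely illustrative.

#include <stdio.h>

/* stand-ins for sysctl_sched_freq_inc_notify / sysctl_sched_freq_dec_notify (kHz) */
static int freq_inc_notify = 10 * 1024 * 1024;
static int freq_dec_notify = 10 * 1024 * 1024;

/* returns 1 when the governor does NOT need to be alerted */
static int nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
{
	int delta = freq_required - cur_freq;

	if (freq_required > cur_freq)
		return delta < freq_inc_notify;

	return -delta < freq_dec_notify;
}

int main(void)
{
	/* with the default ~10 GHz threshold a 1.2 GHz jump does not trigger */
	printf("%d\n", nearly_same_freq(600000, 1800000));

	/* with a 300 MHz threshold the same jump would alert the governor */
	freq_inc_notify = 300000;
	printf("%d\n", nearly_same_freq(600000, 1800000));
	return 0;
}
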
*/ - if (event == PICK_NEXT_TASK) - return 0; - - /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ - return irqtime || cpu_is_waiting_on_io(rq); - } - - if (event == TASK_WAKE) - return 0; - - if (event == PUT_PREV_TASK || event == IRQ_UPDATE) - return 1; - - /* - * TASK_UPDATE can be called on sleeping task, when its moved between - * related groups - */ - if (event == TASK_UPDATE) { - if (rq->curr == p) - return 1; - - return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; - } - - /* TASK_MIGRATE, PICK_NEXT_TASK left */ - return SCHED_FREQ_ACCOUNT_WAIT_TIME; -} - -static inline bool is_new_task(struct task_struct *p) -{ - return p->ravg.active_windows < sysctl_sched_new_task_windows; -} - -#define INC_STEP 8 -#define DEC_STEP 2 -#define CONSISTENT_THRES 16 -#define INC_STEP_BIG 16 -/* - * bucket_increase - update the count of all buckets - * - * @buckets: array of buckets tracking busy time of a task - * @idx: the index of bucket to be incremented - * - * Each time a complete window finishes, count of bucket that runtime - * falls in (@idx) is incremented. Counts of all other buckets are - * decayed. The rate of increase and decay could be different based - * on current count in the bucket. - */ -static inline void bucket_increase(u8 *buckets, int idx) -{ - int i, step; - - for (i = 0; i < NUM_BUSY_BUCKETS; i++) { - if (idx != i) { - if (buckets[i] > DEC_STEP) - buckets[i] -= DEC_STEP; - else - buckets[i] = 0; - } else { - step = buckets[i] >= CONSISTENT_THRES ? - INC_STEP_BIG : INC_STEP; - if (buckets[i] > U8_MAX - step) - buckets[i] = U8_MAX; - else - buckets[i] += step; - } - } -} - -static inline int busy_to_bucket(u32 normalized_rt) -{ - int bidx; - - bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); - bidx = min(bidx, NUM_BUSY_BUCKETS - 1); - - /* - * Combine lowest two buckets. The lowest frequency falls into - * 2nd bucket and thus keep predicting lowest bucket is not - * useful. - */ - if (!bidx) - bidx++; - - return bidx; -} - -static inline u64 -scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) -{ - return div64_u64(load * (u64)src_freq, (u64)dst_freq); -} - -#define HEAVY_TASK_SKIP 2 -#define HEAVY_TASK_SKIP_LIMIT 4 -/* - * get_pred_busy - calculate predicted demand for a task on runqueue - * - * @rq: runqueue of task p - * @p: task whose prediction is being updated - * @start: starting bucket. returned prediction should not be lower than - * this bucket. - * @runtime: runtime of the task. returned prediction should not be lower - * than this runtime. - * Note: @start can be derived from @runtime. It's passed in only to - * avoid duplicated calculation in some cases. - * - * A new predicted busy time is returned for task @p based on @runtime - * passed in. The function searches through buckets that represent busy - * time equal to or bigger than @runtime and attempts to find the bucket to - * to use for prediction. Once found, it searches through historical busy - * time and returns the latest that falls into the bucket. If no such busy - * time exists, it returns the medium of that bucket. 
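
Not from this patch: a runnable mirror of bucket_increase() above, showing how the bucket a task's runtime falls into ramps up (faster once its count crosses CONSISTENT_THRES) while every other bucket decays. NUM_BUSY_BUCKETS is assumed to be 10 here.

#include <stdio.h>
#include <stdint.h>

#define NUM_BUSY_BUCKETS	10
#define INC_STEP		8
#define DEC_STEP		2
#define CONSISTENT_THRES	16
#define INC_STEP_BIG		16

/* bump the hit bucket, decay the rest */
static void bucket_increase(uint8_t *buckets, int idx)
{
	int i, step;

	for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
		if (idx != i) {
			buckets[i] = buckets[i] > DEC_STEP ? buckets[i] - DEC_STEP : 0;
		} else {
			step = buckets[i] >= CONSISTENT_THRES ? INC_STEP_BIG : INC_STEP;
			buckets[i] = buckets[i] > UINT8_MAX - step ? UINT8_MAX
								   : buckets[i] + step;
		}
	}
}

int main(void)
{
	uint8_t buckets[NUM_BUSY_BUCKETS] = { 0 };
	int w;

	/* a task that keeps landing in bucket 7 ramps that bucket up quickly */
	for (w = 0; w < 4; w++)
		bucket_increase(buckets, 7);
	printf("bucket 7 after 4 windows: %u\n", buckets[7]);
	return 0;
}
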
- */ -static u32 get_pred_busy(struct rq *rq, struct task_struct *p, - int start, u32 runtime) -{ - int i; - u8 *buckets = p->ravg.busy_buckets; - u32 *hist = p->ravg.sum_history; - u32 dmin, dmax; - u64 cur_freq_runtime = 0; - int first = NUM_BUSY_BUCKETS, final, skip_to; - u32 ret = runtime; - - /* skip prediction for new tasks due to lack of history */ - if (unlikely(is_new_task(p))) - goto out; - - /* find minimal bucket index to pick */ - for (i = start; i < NUM_BUSY_BUCKETS; i++) { - if (buckets[i]) { - first = i; - break; - } - } - /* if no higher buckets are filled, predict runtime */ - if (first >= NUM_BUSY_BUCKETS) - goto out; - - /* compute the bucket for prediction */ - final = first; - if (first < HEAVY_TASK_SKIP_LIMIT) { - /* compute runtime at current CPU frequency */ - cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, - rq->cluster->efficiency); - cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, - max_possible_freq, rq->cluster->cur_freq); - /* - * if the task runs for majority of the window, try to - * pick higher buckets. - */ - if (cur_freq_runtime >= sched_major_task_runtime) { - int next = NUM_BUSY_BUCKETS; - /* - * if there is a higher bucket that's consistently - * hit, don't jump beyond that. - */ - for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && - i < NUM_BUSY_BUCKETS; i++) { - if (buckets[i] > CONSISTENT_THRES) { - next = i; - break; - } - } - skip_to = min(next, start + HEAVY_TASK_SKIP); - /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ - skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); - /* don't go below first non-empty bucket, if any */ - final = max(first, skip_to); - } - } - - /* determine demand range for the predicted bucket */ - if (final < 2) { - /* lowest two buckets are combined */ - dmin = 0; - final = 1; - } else { - dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); - } - dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); - - /* - * search through runtime history and return first runtime that falls - * into the range of predicted bucket. - */ - for (i = 0; i < sched_ravg_hist_size; i++) { - if (hist[i] >= dmin && hist[i] < dmax) { - ret = hist[i]; - break; - } - } - /* no historical runtime within bucket found, use average of the bin */ - if (ret < dmin) - ret = (dmin + dmax) / 2; - /* - * when updating in middle of a window, runtime could be higher - * than all recorded history. Always predict at least runtime. - */ - ret = max(runtime, ret); -out: - trace_sched_update_pred_demand(rq, p, runtime, - mult_frac((unsigned int)cur_freq_runtime, 100, - sched_ravg_window), ret); - return ret; -} - -static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) -{ - if (p->ravg.pred_demand >= p->ravg.curr_window) - return p->ravg.pred_demand; - - return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), - p->ravg.curr_window); -} - -/* - * predictive demand of a task is calculated at the window roll-over. - * if the task current window busy time exceeds the predicted - * demand, update it here to reflect the task needs. 
- */ -void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) -{ - u32 new, old; - - if (is_idle_task(p) || exiting_task(p)) - return; - - if (event != PUT_PREV_TASK && event != TASK_UPDATE && - (!SCHED_FREQ_ACCOUNT_WAIT_TIME || - (event != TASK_MIGRATE && - event != PICK_NEXT_TASK))) - return; - - /* - * TASK_UPDATE can be called on sleeping task, when its moved between - * related groups - */ - if (event == TASK_UPDATE) { - if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) - return; - } - - new = calc_pred_demand(rq, p); - old = p->ravg.pred_demand; - - if (old >= new) - return; - - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - p->sched_class->fixup_hmp_sched_stats(rq, p, - p->ravg.demand, - new); - - p->ravg.pred_demand = new; -} - -/* - * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) - */ -static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime) -{ - int new_window, full_window = 0; - int p_is_curr_task = (p == rq->curr); - u64 mark_start = p->ravg.mark_start; - u64 window_start = rq->window_start; - u32 window_size = sched_ravg_window; - u64 delta; - u64 *curr_runnable_sum = &rq->curr_runnable_sum; - u64 *prev_runnable_sum = &rq->prev_runnable_sum; - u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; - u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - int flip_counters = 0; - int prev_sum_reset = 0; - bool new_task; - struct related_thread_group *grp; - - new_window = mark_start < window_start; - if (new_window) { - full_window = (window_start - mark_start) >= window_size; - if (p->ravg.active_windows < USHRT_MAX) - p->ravg.active_windows++; - } - - new_task = is_new_task(p); - - grp = p->grp; - if (grp && sched_freq_aggregate) { - /* cpu_time protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu_of(rq)); - - curr_runnable_sum = &cpu_time->curr_runnable_sum; - prev_runnable_sum = &cpu_time->prev_runnable_sum; - - nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - - if (cpu_time->window_start != rq->window_start) { - int nr_windows; - - delta = rq->window_start - cpu_time->window_start; - nr_windows = div64_u64(delta, window_size); - if (nr_windows > 1) - prev_sum_reset = 1; - - cpu_time->window_start = rq->window_start; - flip_counters = 1; - } - - if (p_is_curr_task && new_window) { - u64 curr_sum = rq->curr_runnable_sum; - u64 nt_curr_sum = rq->nt_curr_runnable_sum; - - if (full_window) - curr_sum = nt_curr_sum = 0; - - rq->prev_runnable_sum = curr_sum; - rq->nt_prev_runnable_sum = nt_curr_sum; - - rq->curr_runnable_sum = 0; - rq->nt_curr_runnable_sum = 0; - } - } else { - if (p_is_curr_task && new_window) { - flip_counters = 1; - if (full_window) - prev_sum_reset = 1; - } - } - - /* Handle per-task window rollover. We don't care about the idle - * task or exiting tasks. 
*/ - if (new_window && !is_idle_task(p) && !exiting_task(p)) { - u32 curr_window = 0; - - if (!full_window) - curr_window = p->ravg.curr_window; - - p->ravg.prev_window = curr_window; - p->ravg.curr_window = 0; - } - - if (flip_counters) { - u64 curr_sum = *curr_runnable_sum; - u64 nt_curr_sum = *nt_curr_runnable_sum; - - if (prev_sum_reset) - curr_sum = nt_curr_sum = 0; - - *prev_runnable_sum = curr_sum; - *nt_prev_runnable_sum = nt_curr_sum; - - *curr_runnable_sum = 0; - *nt_curr_runnable_sum = 0; - } - - if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { - /* account_busy_for_cpu_time() = 0, so no update to the - * task's current window needs to be made. This could be - * for example - * - * - a wakeup event on a task within the current - * window (!new_window below, no action required), - * - switching to a new task from idle (PICK_NEXT_TASK) - * in a new window where irqtime is 0 and we aren't - * waiting on IO */ - - if (!new_window) - return; - - /* A new window has started. The RQ demand must be rolled - * over if p is the current task. */ - if (p_is_curr_task) { - /* p is idle task */ - BUG_ON(p != rq->idle); - } - - return; - } - - if (!new_window) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. No rollover - * since we didn't start a new window. An example of this is - * when a task starts execution and then sleeps within the - * same window. */ - - if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) - delta = wallclock - mark_start; - else - delta = irqtime; - delta = scale_exec_time(delta, rq); - *curr_runnable_sum += delta; - if (new_task) - *nt_curr_runnable_sum += delta; - - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.curr_window += delta; - - return; - } - - if (!p_is_curr_task) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. A new window - * has also started, but p is not the current task, so the - * window is not rolled over - just split up and account - * as necessary into curr and prev. The window is only - * rolled over when a new window is processed for the current - * task. - * - * Irqtime can't be accounted by a task that isn't the - * currently running task. */ - - if (!full_window) { - /* A full window hasn't elapsed, account partial - * contribution to previous completed window. */ - delta = scale_exec_time(window_start - mark_start, rq); - if (!exiting_task(p)) - p->ravg.prev_window += delta; - } else { - /* Since at least one full window has elapsed, - * the contribution to the previous window is the - * full window (window_size). */ - delta = scale_exec_time(window_size, rq); - if (!exiting_task(p)) - p->ravg.prev_window = delta; - } - - *prev_runnable_sum += delta; - if (new_task) - *nt_prev_runnable_sum += delta; - - /* Account piece of busy time in the current window. */ - delta = scale_exec_time(wallclock - window_start, rq); - *curr_runnable_sum += delta; - if (new_task) - *nt_curr_runnable_sum += delta; - - if (!exiting_task(p)) - p->ravg.curr_window = delta; - - return; - } - - if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. A new window - * has started and p is the current task so rollover is - * needed. If any of these three above conditions are true - * then this busy time can't be accounted as irqtime. - * - * Busy time for the idle task or exiting tasks need not - * be accounted. 
- * - * An example of this would be a task that starts execution - * and then sleeps once a new window has begun. */ - - if (!full_window) { - /* A full window hasn't elapsed, account partial - * contribution to previous completed window. */ - delta = scale_exec_time(window_start - mark_start, rq); - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.prev_window += delta; - } else { - /* Since at least one full window has elapsed, - * the contribution to the previous window is the - * full window (window_size). */ - delta = scale_exec_time(window_size, rq); - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.prev_window = delta; - } - - /* Rollover is done here by overwriting the values in - * prev_runnable_sum and curr_runnable_sum. */ - *prev_runnable_sum += delta; - if (new_task) - *nt_prev_runnable_sum += delta; - - /* Account piece of busy time in the current window. */ - delta = scale_exec_time(wallclock - window_start, rq); - *curr_runnable_sum += delta; - if (new_task) - *nt_curr_runnable_sum += delta; - - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.curr_window = delta; - - return; - } - - if (irqtime) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. A new window - * has started and p is the current task so rollover is - * needed. The current task must be the idle task because - * irqtime is not accounted for any other task. - * - * Irqtime will be accounted each time we process IRQ activity - * after a period of idleness, so we know the IRQ busy time - * started at wallclock - irqtime. */ - - BUG_ON(!is_idle_task(p)); - mark_start = wallclock - irqtime; - - /* Roll window over. If IRQ busy time was just in the current - * window then that is all that need be accounted. */ - if (mark_start > window_start) { - *curr_runnable_sum = scale_exec_time(irqtime, rq); - return; - } - - /* The IRQ busy time spanned multiple windows. Process the - * busy time preceding the current window start first. */ - delta = window_start - mark_start; - if (delta > window_size) - delta = window_size; - delta = scale_exec_time(delta, rq); - *prev_runnable_sum += delta; - - /* Process the remaining IRQ busy time in the current window. 
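
Not from this patch: a simplified sketch of how update_cpu_busy_time() above splits a busy interval that crosses a window boundary between the previous and current window. The frequency scaling done by scale_exec_time() is deliberately omitted.

#include <stdio.h>
#include <stdint.h>

#define WINDOW_SIZE 10000000ULL	/* 10 ms in ns, matching the default sched_ravg_window */

/*
 * Split a busy interval [mark_start, wallclock) around window_start:
 * the part before window_start is charged to the previous window
 * (capped at one full window), the remainder to the current window.
 */
static void split_busy_time(uint64_t mark_start, uint64_t window_start,
			    uint64_t wallclock, uint64_t *prev, uint64_t *curr)
{
	uint64_t before = window_start - mark_start;

	if (before > WINDOW_SIZE)
		before = WINDOW_SIZE;

	*prev = before;
	*curr = wallclock - window_start;
}

int main(void)
{
	uint64_t prev, curr;

	/* task ran from 4 ms before the boundary to 3 ms after it */
	split_busy_time(6000000, 10000000, 13000000, &prev, &curr);
	printf("prev=%llu ns curr=%llu ns\n",
	       (unsigned long long)prev, (unsigned long long)curr);
	return 0;
}
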
*/ - delta = wallclock - window_start; - rq->curr_runnable_sum = scale_exec_time(delta, rq); - - return; - } - - BUG(); -} - -static inline u32 predict_and_update_buckets(struct rq *rq, - struct task_struct *p, u32 runtime) { - - int bidx; - u32 pred_demand; - - bidx = busy_to_bucket(runtime); - pred_demand = get_pred_busy(rq, p, bidx, runtime); - bucket_increase(p->ravg.busy_buckets, bidx); - - return pred_demand; -} -#define assign_ravg_pred_demand(x) (p->ravg.pred_demand = x) - -#else /* CONFIG_SCHED_FREQ_INPUT */ - -static inline void -update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) -{ -} - -static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime) -{ -} - -static inline u32 predict_and_update_buckets(struct rq *rq, - struct task_struct *p, u32 runtime) -{ - return 0; -} -#define assign_ravg_pred_demand(x) - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -static void update_task_cpu_cycles(struct task_struct *p, int cpu) -{ - if (use_cycle_counter) - p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); -} - -static void -update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, - u64 wallclock, u64 irqtime) -{ - u64 cur_cycles; - int cpu = cpu_of(rq); - - lockdep_assert_held(&rq->lock); - - if (!use_cycle_counter) { - rq->cc.cycles = cpu_cur_freq(cpu); - rq->cc.time = 1; - return; - } - - cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); - - /* - * If current task is idle task and irqtime == 0 CPU was - * indeed idle and probably its cycle counter was not - * increasing. We still need estimatied CPU frequency - * for IO wait time accounting. Use the previously - * calculated frequency in such a case. - */ - if (!is_idle_task(rq->curr) || irqtime) { - if (unlikely(cur_cycles < p->cpu_cycles)) - rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); - else - rq->cc.cycles = cur_cycles - p->cpu_cycles; - rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; - - if (event == IRQ_UPDATE && is_idle_task(p)) - /* - * Time between mark_start of idle task and IRQ handler - * entry time is CPU cycle counter stall period. - * Upon IRQ handler entry sched_account_irqstart() - * replenishes idle task's cpu cycle counter so - * rq->cc.cycles now represents increased cycles during - * IRQ handler rather than time between idle entry and - * IRQ exit. Thus use irqtime as time delta. - */ - rq->cc.time = irqtime; - else - rq->cc.time = wallclock - p->ravg.mark_start; - BUG_ON((s64)rq->cc.time < 0); - } - - p->cpu_cycles = cur_cycles; - - trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); -} - -static int account_busy_for_task_demand(struct task_struct *p, int event) -{ - /* No need to bother updating task demand for exiting tasks - * or the idle task. */ - if (exiting_task(p) || is_idle_task(p)) - return 0; - - /* When a task is waking up it is completing a segment of non-busy - * time. Likewise, if wait time is not treated as busy time, then - * when a task begins to run or is migrated, it is not running and - * is completing a segment of non-busy time. */ - if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && - (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) - return 0; - - return 1; -} - -/* - * Called when new window is starting for a task, to record cpu usage over - * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 - * when, say, a real-time task runs without preemption for several windows at a - * stretch. 
- */ -static void update_history(struct rq *rq, struct task_struct *p, - u32 runtime, int samples, int event) -{ - u32 *hist = &p->ravg.sum_history[0]; - int ridx, widx; - u32 max = 0, avg, demand, pred_demand; - u64 sum = 0; - - /* Ignore windows where task had no activity */ - if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) - goto done; - - /* Push new 'runtime' value onto stack */ - widx = sched_ravg_hist_size - 1; - ridx = widx - samples; - for (; ridx >= 0; --widx, --ridx) { - hist[widx] = hist[ridx]; - sum += hist[widx]; - if (hist[widx] > max) - max = hist[widx]; - } - - for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { - hist[widx] = runtime; - sum += hist[widx]; - if (hist[widx] > max) - max = hist[widx]; - } - - p->ravg.sum = 0; - - if (sched_window_stats_policy == WINDOW_STATS_RECENT) { - demand = runtime; - } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { - demand = max; - } else { - avg = div64_u64(sum, sched_ravg_hist_size); - if (sched_window_stats_policy == WINDOW_STATS_AVG) - demand = avg; - else - demand = max(avg, runtime); - } - pred_demand = predict_and_update_buckets(rq, p, runtime); - - /* - * A throttled deadline sched class task gets dequeued without - * changing p->on_rq. Since the dequeue decrements hmp stats - * avoid decrementing it here again. - */ - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - p->sched_class->fixup_hmp_sched_stats(rq, p, demand, - pred_demand); - - p->ravg.demand = demand; - assign_ravg_pred_demand(pred_demand); - -done: - trace_sched_update_history(rq, p, runtime, samples, event); -} - -static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) -{ - delta = scale_exec_time(delta, rq); - p->ravg.sum += delta; - if (unlikely(p->ravg.sum > sched_ravg_window)) - p->ravg.sum = sched_ravg_window; -} - -/* - * Account cpu demand of task and/or update task's cpu demand history - * - * ms = p->ravg.mark_start; - * wc = wallclock - * ws = rq->window_start - * - * Three possibilities: - * - * a) Task event is contained within one window. - * window_start < mark_start < wallclock - * - * ws ms wc - * | | | - * V V V - * |---------------| - * - * In this case, p->ravg.sum is updated *iff* event is appropriate - * (ex: event == PUT_PREV_TASK) - * - * b) Task event spans two windows. - * mark_start < window_start < wallclock - * - * ms ws wc - * | | | - * V V V - * -----|------------------- - * - * In this case, p->ravg.sum is updated with (ws - ms) *iff* event - * is appropriate, then a new window sample is recorded followed - * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. - * - * c) Task event spans more than two windows. - * - * ms ws_tmp ws wc - * | | | | - * V V V V - * ---|-------|-------|-------|-------|------ - * | | - * |<------ nr_full_windows ------>| - * - * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* - * event is appropriate, window sample of p->ravg.sum is recorded, - * 'nr_full_window' samples of window_size is also recorded *iff* - * event is appropriate and finally p->ravg.sum is set to (wc - ws) - * *iff* event is appropriate. - * - * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() - * depends on it! 
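
Not from this patch: a standalone sketch of the demand selection in update_history() above for the four sched_window_stats_policy values, assuming the most recent runtime has already been pushed into the history array (as the kernel code does before choosing the demand).

#include <stdio.h>
#include <stdint.h>

#define HIST_SIZE 5	/* default sched_ravg_hist_size */

enum { WINDOW_STATS_RECENT, WINDOW_STATS_MAX, WINDOW_STATS_AVG,
       WINDOW_STATS_MAX_RECENT_AVG };

/* pick a demand figure from the recent-window history */
static uint32_t pick_demand(const uint32_t *hist, uint32_t runtime, int policy)
{
	uint64_t sum = 0;
	uint32_t max = 0, avg;
	int i;

	for (i = 0; i < HIST_SIZE; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = (uint32_t)(sum / HIST_SIZE);

	switch (policy) {
	case WINDOW_STATS_RECENT:
		return runtime;
	case WINDOW_STATS_MAX:
		return max;
	case WINDOW_STATS_AVG:
		return avg;
	default:	/* WINDOW_STATS_MAX_RECENT_AVG */
		return avg > runtime ? avg : runtime;
	}
}

int main(void)
{
	/* most recent window first, values are scaled execution time in ns */
	uint32_t hist[HIST_SIZE] = { 2000000, 9000000, 3000000, 3000000, 3000000 };

	printf("max_recent_avg: %u\n",
	       pick_demand(hist, hist[0], WINDOW_STATS_MAX_RECENT_AVG));
	printf("max:            %u\n", pick_demand(hist, hist[0], WINDOW_STATS_MAX));
	return 0;
}
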
- */ -static void update_task_demand(struct task_struct *p, struct rq *rq, - int event, u64 wallclock) -{ - u64 mark_start = p->ravg.mark_start; - u64 delta, window_start = rq->window_start; - int new_window, nr_full_windows; - u32 window_size = sched_ravg_window; - - new_window = mark_start < window_start; - if (!account_busy_for_task_demand(p, event)) { - if (new_window) - /* If the time accounted isn't being accounted as - * busy time, and a new window started, only the - * previous window need be closed out with the - * pre-existing demand. Multiple windows may have - * elapsed, but since empty windows are dropped, - * it is not necessary to account those. */ - update_history(rq, p, p->ravg.sum, 1, event); - return; - } - - if (!new_window) { - /* The simple case - busy time contained within the existing - * window. */ - add_to_task_demand(rq, p, wallclock - mark_start); - return; - } - - /* Busy time spans at least two windows. Temporarily rewind - * window_start to first window boundary after mark_start. */ - delta = window_start - mark_start; - nr_full_windows = div64_u64(delta, window_size); - window_start -= (u64)nr_full_windows * (u64)window_size; - - /* Process (window_start - mark_start) first */ - add_to_task_demand(rq, p, window_start - mark_start); - - /* Push new sample(s) into task's demand history */ - update_history(rq, p, p->ravg.sum, 1, event); - if (nr_full_windows) - update_history(rq, p, scale_exec_time(window_size, rq), - nr_full_windows, event); - - /* Roll window_start back to current to process any remainder - * in current window. */ - window_start += (u64)nr_full_windows * (u64)window_size; - - /* Process (wallclock - window_start) next */ - mark_start = window_start; - add_to_task_demand(rq, p, wallclock - mark_start); -} - -/* Reflect task activity on its demand and cpu's busy time statistics */ -static void -update_task_ravg(struct task_struct *p, struct rq *rq, int event, - u64 wallclock, u64 irqtime) -{ - if (sched_use_pelt || !rq->window_start || sched_disable_window_stats) - return; - - lockdep_assert_held(&rq->lock); - - update_window_start(rq, wallclock); - - if (!p->ravg.mark_start) { - update_task_cpu_cycles(p, cpu_of(rq)); - goto done; - } - - update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); - update_task_demand(p, rq, event, wallclock); - update_cpu_busy_time(p, rq, event, wallclock, irqtime); - update_task_pred_demand(rq, p, event); -done: - trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, - rq->cc.cycles, rq->cc.time, - _group_cpu_time(p->grp, cpu_of(rq))); - - p->ravg.mark_start = wallclock; -} - -void sched_account_irqtime(int cpu, struct task_struct *curr, - u64 delta, u64 wallclock) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags, nr_windows; - u64 cur_jiffies_ts; - - raw_spin_lock_irqsave(&rq->lock, flags); - - /* - * cputime (wallclock) uses sched_clock so use the same here for - * consistency. - */ - delta += sched_clock() - wallclock; - cur_jiffies_ts = get_jiffies_64(); - - if (is_idle_task(curr)) - update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), - delta); - - nr_windows = cur_jiffies_ts - rq->irqload_ts; - - if (nr_windows) { - if (nr_windows < 10) { - /* Decay CPU's irqload by 3/4 for each window. 
*/ - rq->avg_irqload *= (3 * nr_windows); - rq->avg_irqload = div64_u64(rq->avg_irqload, - 4 * nr_windows); - } else { - rq->avg_irqload = 0; - } - rq->avg_irqload += rq->cur_irqload; - rq->cur_irqload = 0; - } - - rq->cur_irqload += delta; - rq->irqload_ts = cur_jiffies_ts; - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) -{ - struct rq *rq = cpu_rq(cpu); - - if (!rq->window_start || sched_disable_window_stats) - return; - - if (is_idle_task(curr)) { - /* We're here without rq->lock held, IRQ disabled */ - raw_spin_lock(&rq->lock); - update_task_cpu_cycles(curr, cpu); - raw_spin_unlock(&rq->lock); - } -} - -static void reset_task_stats(struct task_struct *p) -{ - u32 sum = 0; - - if (exiting_task(p)) - sum = EXITING_TASK_MARKER; - - memset(&p->ravg, 0, sizeof(struct ravg)); - /* Retain EXITING_TASK marker */ - p->ravg.sum_history[0] = sum; -} - -static inline void mark_task_starting(struct task_struct *p) -{ - u64 wallclock; - struct rq *rq = task_rq(p); - - if (!rq->window_start || sched_disable_window_stats) { - reset_task_stats(p); - return; - } - - wallclock = sched_ktime_clock(); - p->ravg.mark_start = p->last_wake_ts = wallclock; - p->last_cpu_selected_ts = wallclock; - p->last_switch_out_ts = 0; - update_task_cpu_cycles(p, cpu_of(rq)); -} - -static inline void set_window_start(struct rq *rq) -{ - int cpu = cpu_of(rq); - struct rq *sync_rq = cpu_rq(sync_cpu); - - if (rq->window_start || !sched_enable_hmp) - return; - - if (cpu == sync_cpu) { - rq->window_start = sched_ktime_clock(); - } else { - raw_spin_unlock(&rq->lock); - double_rq_lock(rq, sync_rq); - rq->window_start = cpu_rq(sync_cpu)->window_start; -#ifdef CONFIG_SCHED_FREQ_INPUT - rq->curr_runnable_sum = rq->prev_runnable_sum = 0; - rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; -#endif - raw_spin_unlock(&sync_rq->lock); - } - - rq->curr->ravg.mark_start = rq->window_start; -} - -static inline void migrate_sync_cpu(int cpu) -{ - if (cpu == sync_cpu) - sync_cpu = smp_processor_id(); -} - -static void reset_all_task_stats(void) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - reset_task_stats(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -/* - * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field - * - * Stop accounting (exiting) task's future cpu usage - * - * We need this so that reset_all_windows_stats() can function correctly. - * reset_all_window_stats() depends on do_each_thread/for_each_thread task - * iterators to reset *all* task's statistics. Exiting tasks however become - * invisible to those iterators. sched_exit() is called on a exiting task prior - * to being removed from task_list, which will let reset_all_window_stats() - * function correctly. 
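
Not from this patch: a user-space mirror of the irqload aging at the top of sched_account_irqtime() above. When fewer than ten windows have elapsed the running average is decayed by 3/4; after a longer idle stretch it is dropped entirely; in either case the accumulated cur_irqload is then folded in and cleared.

#include <stdio.h>
#include <stdint.h>

static void age_irqload(uint64_t *avg_irqload, uint64_t *cur_irqload,
			uint64_t nr_windows)
{
	if (nr_windows) {
		if (nr_windows < 10)
			/* decay the running average by 3/4 */
			*avg_irqload = (*avg_irqload * 3 * nr_windows) /
				       (4 * nr_windows);
		else
			/* idle long enough: forget the history */
			*avg_irqload = 0;
		*avg_irqload += *cur_irqload;
		*cur_irqload = 0;
	}
}

int main(void)
{
	uint64_t avg = 8000000, cur = 2000000;

	age_irqload(&avg, &cur, 1);	/* 8 ms * 3/4 + 2 ms = 8 ms */
	printf("avg_irqload=%llu\n", (unsigned long long)avg);

	age_irqload(&avg, &cur, 12);	/* long idle: history reset, cur already 0 */
	printf("avg_irqload=%llu\n", (unsigned long long)avg);
	return 0;
}
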
- */ -void sched_exit(struct task_struct *p) -{ - unsigned long flags; - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - u64 wallclock; - - sched_set_group_id(p, 0); - - raw_spin_lock_irqsave(&rq->lock, flags); - /* rq->curr == p */ - wallclock = sched_ktime_clock(); - update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - dequeue_task(rq, p, 0); - reset_task_stats(p); - p->ravg.mark_start = wallclock; - p->ravg.sum_history[0] = EXITING_TASK_MARKER; - enqueue_task(rq, p, 0); - clear_ed_task(p, rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); - - put_cpu(); -} - -static void disable_window_stats(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - sched_disable_window_stats = 1; - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_restore(flags); -} - -/* Called with all cpu's rq->lock held */ -static void enable_window_stats(void) -{ - sched_disable_window_stats = 0; - -} - -enum reset_reason_code { - WINDOW_CHANGE, - POLICY_CHANGE, - HIST_SIZE_CHANGE, - FREQ_AGGREGATE_CHANGE, -}; - -const char *sched_window_reset_reasons[] = { - "WINDOW_CHANGE", - "POLICY_CHANGE", - "HIST_SIZE_CHANGE", -}; - -/* Called with IRQs enabled */ -void reset_all_window_stats(u64 window_start, unsigned int window_size) -{ - int cpu; - unsigned long flags; - u64 start_ts = sched_ktime_clock(); - int reason = WINDOW_CHANGE; - unsigned int old = 0, new = 0; - struct related_thread_group *grp; - - disable_window_stats(); - - reset_all_task_stats(); - - local_irq_save(flags); - - read_lock(&related_thread_group_lock); - - for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - } - - list_for_each_entry(grp, &related_thread_groups, list) { - int j; - - for_each_possible_cpu(j) { - struct group_cpu_time *cpu_time; - /* Protected by rq lock */ - cpu_time = _group_cpu_time(grp, j); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - if (window_start) - cpu_time->window_start = window_start; - } - } - - if (window_size) { - sched_ravg_window = window_size * TICK_NSEC; - set_hmp_defaults(); - } - - enable_window_stats(); - - for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - - if (window_start) - rq->window_start = window_start; -#ifdef CONFIG_SCHED_FREQ_INPUT - rq->curr_runnable_sum = rq->prev_runnable_sum = 0; - rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; -#endif - reset_cpu_hmp_stats(cpu, 1); - } - - if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { - reason = POLICY_CHANGE; - old = sched_window_stats_policy; - new = sysctl_sched_window_stats_policy; - sched_window_stats_policy = sysctl_sched_window_stats_policy; - } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { - reason = HIST_SIZE_CHANGE; - old = sched_ravg_hist_size; - new = sysctl_sched_ravg_hist_size; - sched_ravg_hist_size = sysctl_sched_ravg_hist_size; - } -#ifdef CONFIG_SCHED_FREQ_INPUT - else if (sched_freq_aggregate != - sysctl_sched_freq_aggregate) { - reason = FREQ_AGGREGATE_CHANGE; - old = sched_freq_aggregate; - new = sysctl_sched_freq_aggregate; - sched_freq_aggregate = sysctl_sched_freq_aggregate; - } -#endif - - for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - raw_spin_unlock(&rq->lock); - } - - read_unlock(&related_thread_group_lock); - - local_irq_restore(flags); - - trace_sched_reset_all_window_stats(window_start, window_size, - sched_ktime_clock() - start_ts, reason, old, new); -} - -#ifdef CONFIG_SCHED_FREQ_INPUT - 
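[Editor's note] After zeroing the per-CPU and per-group counters under every runqueue lock, reset_all_window_stats() above works out which tunable change triggered the reset and records the old and new values for the trace event. A reduced userspace model of that reason selection; the variable names and values are illustrative, not the kernel's:

#include <stdio.h>

enum demo_reset_reason { WINDOW_CHANGE, POLICY_CHANGE, HIST_SIZE_CHANGE };

static unsigned int live_policy = 0, live_hist_size = 5;      /* values in use */
static unsigned int sysctl_policy = 0, sysctl_hist_size = 5;  /* values requested via sysctl */

static enum demo_reset_reason demo_sync_tunables(unsigned int *old, unsigned int *new)
{
    if (live_policy != sysctl_policy) {
        *old = live_policy; *new = sysctl_policy;
        live_policy = sysctl_policy;
        return POLICY_CHANGE;
    }
    if (live_hist_size != sysctl_hist_size) {
        *old = live_hist_size; *new = sysctl_hist_size;
        live_hist_size = sysctl_hist_size;
        return HIST_SIZE_CHANGE;
    }
    return WINDOW_CHANGE; /* default: the window size itself changed */
}

int main(void)
{
    unsigned int old = 0, new = 0;

    sysctl_hist_size = 3;   /* e.g. a sysctl write shrank the history */
    printf("reason=%d old=%u new=%u\n", demo_sync_tunables(&old, &new), old, new);
    return 0;
}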
-static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); - -void sched_get_cpus_busy(struct sched_load *busy, - const struct cpumask *query_cpus) -{ - unsigned long flags; - struct rq *rq; - const int cpus = cpumask_weight(query_cpus); - u64 load[cpus], group_load[cpus]; - u64 nload[cpus], ngload[cpus]; - u64 pload[cpus]; - unsigned int cur_freq[cpus], max_freq[cpus]; - int notifier_sent[cpus]; - int early_detection[cpus]; - int cpu, i = 0; - unsigned int window_size; - u64 max_prev_sum = 0; - int max_busy_cpu = cpumask_first(query_cpus); - struct related_thread_group *grp; - - if (unlikely(cpus == 0)) - return; - - /* - * This function could be called in timer context, and the - * current task may have been executing for a long time. Ensure - * that the window stats are current by doing an update. - */ - read_lock(&related_thread_group_lock); - - local_irq_save(flags); - for_each_cpu(cpu, query_cpus) - raw_spin_lock(&cpu_rq(cpu)->lock); - - window_size = sched_ravg_window; - - for_each_cpu(cpu, query_cpus) { - rq = cpu_rq(cpu); - - update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), - 0); - cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); - - load[i] = rq->old_busy_time = rq->prev_runnable_sum; - nload[i] = rq->nt_prev_runnable_sum; - pload[i] = rq->hmp_stats.pred_demands_sum; - rq->old_estimated_time = pload[i]; - - if (load[i] > max_prev_sum) { - max_prev_sum = load[i]; - max_busy_cpu = cpu; - } - - notifier_sent[i] = rq->notifier_sent; - early_detection[i] = (rq->ed_task != NULL); - rq->notifier_sent = 0; - cur_freq[i] = cpu_cur_freq(cpu); - max_freq[i] = cpu_max_freq(cpu); - i++; - } - - for_each_related_thread_group(grp) { - for_each_cpu(cpu, query_cpus) { - /* Protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu); - sync_window_start(cpu_rq(cpu), cpu_time); - } - } - - i = 0; - for_each_cpu(cpu, query_cpus) { - group_load[i] = 0; - ngload[i] = 0; - - if (early_detection[i]) - goto skip_early; - - rq = cpu_rq(cpu); - if (!notifier_sent[i]) { - if (cpu == max_busy_cpu) - group_load_in_freq_domain( - &rq->freq_domain_cpumask, - &group_load[i], &ngload[i]); - } else { - _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); - } - - load[i] += group_load[i]; - nload[i] += ngload[i]; - /* - * Scale load in reference to cluster max_possible_freq. - * - * Note that scale_load_to_cpu() scales load in reference to - * the cluster max_freq. 
- */ - load[i] = scale_load_to_cpu(load[i], cpu); - nload[i] = scale_load_to_cpu(nload[i], cpu); - pload[i] = scale_load_to_cpu(pload[i], cpu); -skip_early: - i++; - } - - for_each_cpu(cpu, query_cpus) - raw_spin_unlock(&(cpu_rq(cpu))->lock); - local_irq_restore(flags); - - read_unlock(&related_thread_group_lock); - - i = 0; - for_each_cpu(cpu, query_cpus) { - rq = cpu_rq(cpu); - - if (early_detection[i]) { - busy[i].prev_load = div64_u64(sched_ravg_window, - NSEC_PER_USEC); - busy[i].new_task_load = 0; - goto exit_early; - } - - if (!notifier_sent[i]) { - load[i] = scale_load_to_freq(load[i], max_freq[i], - cur_freq[i]); - nload[i] = scale_load_to_freq(nload[i], max_freq[i], - cur_freq[i]); - if (load[i] > window_size) - load[i] = window_size; - if (nload[i] > window_size) - nload[i] = window_size; - - load[i] = scale_load_to_freq(load[i], cur_freq[i], - cpu_max_possible_freq(cpu)); - nload[i] = scale_load_to_freq(nload[i], cur_freq[i], - cpu_max_possible_freq(cpu)); - } else { - load[i] = scale_load_to_freq(load[i], max_freq[i], - cpu_max_possible_freq(cpu)); - nload[i] = scale_load_to_freq(nload[i], max_freq[i], - cpu_max_possible_freq(cpu)); - } - pload[i] = scale_load_to_freq(pload[i], max_freq[i], - rq->cluster->max_possible_freq); - - busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); - busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); - busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); - -exit_early: - trace_sched_get_busy(cpu, busy[i].prev_load, - busy[i].new_task_load, - busy[i].predicted_load, - early_detection[i]); - i++; - } -} - -void sched_set_io_is_busy(int val) -{ - sched_io_is_busy = val; -} - -int sched_set_window(u64 window_start, unsigned int window_size) -{ - u64 now, cur_jiffies, jiffy_ktime_ns; - s64 ws; - unsigned long flags; - - if (sched_use_pelt || - (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)) - return -EINVAL; - - mutex_lock(&policy_mutex); - - /* - * Get a consistent view of ktime, jiffies, and the time - * since the last jiffy (based on last_jiffies_update). 
- */ - local_irq_save(flags); - cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); - local_irq_restore(flags); - - /* translate window_start from jiffies to nanoseconds */ - ws = (window_start - cur_jiffies); /* jiffy difference */ - ws *= TICK_NSEC; - ws += jiffy_ktime_ns; - - /* roll back calculated window start so that it is in - * the past (window stats must have a current window) */ - while (ws > now) - ws -= (window_size * TICK_NSEC); - - BUG_ON(sched_ktime_clock() < ws); - - reset_all_window_stats(ws, window_size); - - sched_update_freq_max_load(cpu_possible_mask); - - mutex_unlock(&policy_mutex); - - return 0; -} - -static void fixup_busy_time(struct task_struct *p, int new_cpu) -{ - struct rq *src_rq = task_rq(p); - struct rq *dest_rq = cpu_rq(new_cpu); - u64 wallclock; - u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; - u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; - u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; - u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - int migrate_type; - struct migration_sum_data d; - bool new_task; - struct related_thread_group *grp; - - if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) - return; - - if (exiting_task(p)) { - clear_ed_task(p, src_rq); - return; - } - - if (p->state == TASK_WAKING) - double_rq_lock(src_rq, dest_rq); - - if (sched_disable_window_stats) - goto done; - - wallclock = sched_ktime_clock(); - - update_task_ravg(task_rq(p)->curr, task_rq(p), - TASK_UPDATE, - wallclock, 0); - update_task_ravg(dest_rq->curr, dest_rq, - TASK_UPDATE, wallclock, 0); - - update_task_ravg(p, task_rq(p), TASK_MIGRATE, - wallclock, 0); - - update_task_cpu_cycles(p, new_cpu); - - new_task = is_new_task(p); - /* Protected by rq_lock */ - grp = p->grp; - if (grp && sched_freq_aggregate) { - struct group_cpu_time *cpu_time; - - migrate_type = GROUP_TO_GROUP; - /* Protected by rq_lock */ - cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); - d.src_rq = NULL; - d.src_cpu_time = cpu_time; - src_curr_runnable_sum = &cpu_time->curr_runnable_sum; - src_prev_runnable_sum = &cpu_time->prev_runnable_sum; - src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - - /* Protected by rq_lock */ - cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; - dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; - dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; - dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - sync_window_start(dest_rq, cpu_time); - } else { - migrate_type = RQ_TO_RQ; - d.src_rq = src_rq; - d.src_cpu_time = NULL; - d.dst_rq = dest_rq; - d.dst_cpu_time = NULL; - src_curr_runnable_sum = &src_rq->curr_runnable_sum; - src_prev_runnable_sum = &src_rq->prev_runnable_sum; - src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; - - dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; - dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; - dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; - dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; - } - - if (p->ravg.curr_window) { - *src_curr_runnable_sum -= p->ravg.curr_window; - *dst_curr_runnable_sum += p->ravg.curr_window; - if (new_task) { - *src_nt_curr_runnable_sum -= p->ravg.curr_window; - *dst_nt_curr_runnable_sum += p->ravg.curr_window; - } - } - - if (p->ravg.prev_window) { - 
*src_prev_runnable_sum -= p->ravg.prev_window; - *dst_prev_runnable_sum += p->ravg.prev_window; - if (new_task) { - *src_nt_prev_runnable_sum -= p->ravg.prev_window; - *dst_nt_prev_runnable_sum += p->ravg.prev_window; - } - } - - if (p == src_rq->ed_task) { - src_rq->ed_task = NULL; - if (!dest_rq->ed_task) - dest_rq->ed_task = p; - } - - trace_sched_migration_update_sum(p, migrate_type, &d); - BUG_ON((s64)*src_prev_runnable_sum < 0); - BUG_ON((s64)*src_curr_runnable_sum < 0); - BUG_ON((s64)*src_nt_prev_runnable_sum < 0); - BUG_ON((s64)*src_nt_curr_runnable_sum < 0); - -done: - if (p->state == TASK_WAKING) - double_rq_unlock(src_rq, dest_rq); -} - -#else - -static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -#define sched_up_down_migrate_auto_update 1 -static void check_for_up_down_migrate_update(const struct cpumask *cpus) -{ - int i = cpumask_first(cpus); - - if (!sched_up_down_migrate_auto_update) - return; - - if (cpu_max_possible_capacity(i) == max_possible_capacity) - return; - - if (cpu_max_possible_freq(i) == cpu_max_freq(i)) - up_down_migrate_scale_factor = 1024; - else - up_down_migrate_scale_factor = (1024 * - cpu_max_possible_freq(i)) / cpu_max_freq(i); - - update_up_down_migrate(); -} - -/* Return cluster which can offer required capacity for group */ -static struct sched_cluster * -best_cluster(struct related_thread_group *grp, u64 total_demand) -{ - struct sched_cluster *cluster = NULL; - - for_each_sched_cluster(cluster) { - if (group_will_fit(cluster, grp, total_demand)) - return cluster; - } - - return NULL; -} - -static void _set_preferred_cluster(struct related_thread_group *grp) -{ - struct task_struct *p; - u64 combined_demand = 0; - - if (!sysctl_sched_enable_colocation) { - grp->last_update = sched_ktime_clock(); - grp->preferred_cluster = NULL; - return; - } - - /* - * wakeup of two or more related tasks could race with each other and - * could result in multiple calls to _set_preferred_cluster being issued - * at same time. Avoid overhead in such cases of rechecking preferred - * cluster - */ - if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) - return; - - list_for_each_entry(p, &grp->tasks, grp_list) - combined_demand += p->ravg.demand; - - grp->preferred_cluster = best_cluster(grp, combined_demand); - grp->last_update = sched_ktime_clock(); - trace_sched_set_preferred_cluster(grp, combined_demand); -} - -static void set_preferred_cluster(struct related_thread_group *grp) -{ - raw_spin_lock(&grp->lock); - _set_preferred_cluster(grp); - raw_spin_unlock(&grp->lock); -} - -#define ADD_TASK 0 -#define REM_TASK 1 - -#ifdef CONFIG_SCHED_FREQ_INPUT - -static void -update_task_ravg(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime); - -static inline void free_group_cputime(struct related_thread_group *grp) -{ - free_percpu(grp->cpu_time); -} - -static int alloc_group_cputime(struct related_thread_group *grp) -{ - int i; - struct group_cpu_time *cpu_time; - int cpu = raw_smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - u64 window_start = rq->window_start; - - grp->cpu_time = alloc_percpu(struct group_cpu_time); - if (!grp->cpu_time) - return -ENOMEM; - - for_each_possible_cpu(i) { - cpu_time = per_cpu_ptr(grp->cpu_time, i); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - cpu_time->window_start = window_start; - } - - return 0; -} - -/* - * A group's window_start may be behind. When moving it forward, flip prev/curr - * counters. 
When moving forward > 1 window, prev counter is set to 0 - */ -static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) -{ - u64 delta; - int nr_windows; - u64 curr_sum = cpu_time->curr_runnable_sum; - u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; - - delta = rq->window_start - cpu_time->window_start; - if (!delta) - return; - - nr_windows = div64_u64(delta, sched_ravg_window); - if (nr_windows > 1) - curr_sum = nt_curr_sum = 0; - - cpu_time->prev_runnable_sum = curr_sum; - cpu_time->curr_runnable_sum = 0; - - cpu_time->nt_prev_runnable_sum = nt_curr_sum; - cpu_time->nt_curr_runnable_sum = 0; - - cpu_time->window_start = rq->window_start; -} - -/* - * Task's cpu usage is accounted in: - * rq->curr/prev_runnable_sum, when its ->grp is NULL - * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL - * - * Transfer task's cpu usage between those counters when transitioning between - * groups - */ -static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, - struct task_struct *p, int event) -{ - u64 wallclock; - struct group_cpu_time *cpu_time; - u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; - u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; - u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; - u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - struct migration_sum_data d; - int migrate_type; - - if (!sched_freq_aggregate) - return; - - wallclock = sched_ktime_clock(); - - update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); - - /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ - cpu_time = _group_cpu_time(grp, cpu_of(rq)); - if (event == ADD_TASK) { - sync_window_start(rq, cpu_time); - migrate_type = RQ_TO_GROUP; - d.src_rq = rq; - d.src_cpu_time = NULL; - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; - src_curr_runnable_sum = &rq->curr_runnable_sum; - dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; - src_prev_runnable_sum = &rq->prev_runnable_sum; - dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; - - src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; - dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - } else if (event == REM_TASK) { - migrate_type = GROUP_TO_RQ; - d.src_rq = NULL; - d.src_cpu_time = cpu_time; - d.dst_rq = rq; - d.dst_cpu_time = NULL; - - /* - * In case of REM_TASK, cpu_time->window_start would be - * uptodate, because of the update_task_ravg() we called - * above on the moving task. 
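[Editor's note] sync_window_start() above rolls a group's per-CPU counters forward to the runqueue's current window: exactly one window behind means the "current" sums become the "previous" sums; more than one window behind means the previous window was empty, so both are dropped. The new-task (nt_) sums follow the same pattern. A standalone model of that flip, with an illustrative struct and a fixed window size:

#include <stdint.h>
#include <stdio.h>

#define WINDOW_NS 20000000ULL  /* assumed 20 ms window, illustrative only */

struct demo_group_time {
    uint64_t window_start;
    uint64_t curr_sum, prev_sum;
};

static void demo_sync_window_start(struct demo_group_time *gt, uint64_t rq_window_start)
{
    uint64_t delta = rq_window_start - gt->window_start;
    uint64_t curr = gt->curr_sum;

    if (!delta)
        return;

    if (delta / WINDOW_NS > 1)
        curr = 0;        /* skipped a whole window: the previous one was empty */

    gt->prev_sum = curr; /* what was "current" is now the previous window */
    gt->curr_sum = 0;
    gt->window_start = rq_window_start;
}

int main(void)
{
    struct demo_group_time gt = { .window_start = 0, .curr_sum = 7000, .prev_sum = 3000 };

    demo_sync_window_start(&gt, WINDOW_NS);   /* exactly one window forward */
    printf("prev=%llu curr=%llu\n",
           (unsigned long long)gt.prev_sum, (unsigned long long)gt.curr_sum);
    return 0;
}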
Hence no need for - * sync_window_start() - */ - src_curr_runnable_sum = &cpu_time->curr_runnable_sum; - dst_curr_runnable_sum = &rq->curr_runnable_sum; - src_prev_runnable_sum = &cpu_time->prev_runnable_sum; - dst_prev_runnable_sum = &rq->prev_runnable_sum; - - src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - } - - *src_curr_runnable_sum -= p->ravg.curr_window; - *dst_curr_runnable_sum += p->ravg.curr_window; - - *src_prev_runnable_sum -= p->ravg.prev_window; - *dst_prev_runnable_sum += p->ravg.prev_window; - - if (is_new_task(p)) { - *src_nt_curr_runnable_sum -= p->ravg.curr_window; - *dst_nt_curr_runnable_sum += p->ravg.curr_window; - *src_nt_prev_runnable_sum -= p->ravg.prev_window; - *dst_nt_prev_runnable_sum += p->ravg.prev_window; - } - - trace_sched_migration_update_sum(p, migrate_type, &d); - - BUG_ON((s64)*src_curr_runnable_sum < 0); - BUG_ON((s64)*src_prev_runnable_sum < 0); -} - -static inline struct group_cpu_time * -task_group_cpu_time(struct task_struct *p, int cpu) -{ - return _group_cpu_time(rcu_dereference(p->grp), cpu); -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu) -{ - return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; -} - -#else /* CONFIG_SCHED_FREQ_INPUT */ - -static inline void free_group_cputime(struct related_thread_group *grp) { } - -static inline int alloc_group_cputime(struct related_thread_group *grp) -{ - return 0; -} - -static inline void transfer_busy_time(struct rq *rq, - struct related_thread_group *grp, struct task_struct *p, int event) -{ -} - -static struct group_cpu_time * -task_group_cpu_time(struct task_struct *p, int cpu) -{ - return NULL; -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu) -{ - return NULL; -} - -#endif - -struct related_thread_group *alloc_related_thread_group(int group_id) -{ - struct related_thread_group *grp; - - grp = kzalloc(sizeof(*grp), GFP_KERNEL); - if (!grp) - return ERR_PTR(-ENOMEM); - - if (alloc_group_cputime(grp)) { - kfree(grp); - return ERR_PTR(-ENOMEM); - } - - grp->id = group_id; - INIT_LIST_HEAD(&grp->tasks); - INIT_LIST_HEAD(&grp->list); - raw_spin_lock_init(&grp->lock); - - return grp; -} - -struct related_thread_group *lookup_related_thread_group(unsigned int group_id) -{ - struct related_thread_group *grp; - - list_for_each_entry(grp, &related_thread_groups, list) { - if (grp->id == group_id) - return grp; - } - - return NULL; -} - -/* See comments before preferred_cluster() */ -static void free_related_thread_group(struct rcu_head *rcu) -{ - struct related_thread_group *grp = container_of(rcu, struct - related_thread_group, rcu); - - free_group_cputime(grp); - kfree(grp); -} - -static void remove_task_from_group(struct task_struct *p) -{ - struct related_thread_group *grp = p->grp; - struct rq *rq; - int empty_group = 1; - - raw_spin_lock(&grp->lock); - - rq = __task_rq_lock(p); - transfer_busy_time(rq, p->grp, p, REM_TASK); - list_del_init(&p->grp_list); - rcu_assign_pointer(p->grp, NULL); - __task_rq_unlock(rq); - - if (!list_empty(&grp->tasks)) { - empty_group = 0; - _set_preferred_cluster(grp); - } - - raw_spin_unlock(&grp->lock); - - if (empty_group) { - list_del(&grp->list); - call_rcu(&grp->rcu, free_related_thread_group); - } -} - -static int -add_task_to_group(struct task_struct *p, struct 
related_thread_group *grp) -{ - struct rq *rq; - - raw_spin_lock(&grp->lock); - - /* - * Change p->grp under rq->lock. Will prevent races with read-side - * reference of p->grp in various hot-paths - */ - rq = __task_rq_lock(p); - transfer_busy_time(rq, grp, p, ADD_TASK); - list_add(&p->grp_list, &grp->tasks); - rcu_assign_pointer(p->grp, grp); - __task_rq_unlock(rq); - - _set_preferred_cluster(grp); - - raw_spin_unlock(&grp->lock); - - return 0; -} - -int sched_set_group_id(struct task_struct *p, unsigned int group_id) -{ - int rc = 0, destroy = 0; - unsigned long flags; - struct related_thread_group *grp = NULL, *new = NULL; - -redo: - raw_spin_lock_irqsave(&p->pi_lock, flags); - - if ((current != p && p->flags & PF_EXITING) || - (!p->grp && !group_id) || - (p->grp && p->grp->id == group_id)) - goto done; - - write_lock(&related_thread_group_lock); - - if (!group_id) { - remove_task_from_group(p); - write_unlock(&related_thread_group_lock); - goto done; - } - - if (p->grp && p->grp->id != group_id) - remove_task_from_group(p); - - grp = lookup_related_thread_group(group_id); - if (!grp && !new) { - /* New group */ - write_unlock(&related_thread_group_lock); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - new = alloc_related_thread_group(group_id); - if (IS_ERR(new)) - return -ENOMEM; - destroy = 1; - /* Rerun checks (like task exiting), since we dropped pi_lock */ - goto redo; - } else if (!grp && new) { - /* New group - use object allocated before */ - destroy = 0; - list_add(&new->list, &related_thread_groups); - grp = new; - } - - BUG_ON(!grp); - rc = add_task_to_group(p, grp); - write_unlock(&related_thread_group_lock); -done: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - - if (new && destroy) { - free_group_cputime(new); - kfree(new); - } - - return rc; -} - -unsigned int sched_get_group_id(struct task_struct *p) -{ - unsigned int group_id; - struct related_thread_group *grp; - - rcu_read_lock(); - grp = task_related_thread_group(p); - group_id = grp ? 
grp->id : 0; - rcu_read_unlock(); - - return group_id; -} - -static void update_cpu_cluster_capacity(const cpumask_t *cpus) -{ - int i; - struct sched_cluster *cluster; - struct cpumask cpumask; - - cpumask_copy(&cpumask, cpus); - pre_big_task_count_change(cpu_possible_mask); - - for_each_cpu(i, &cpumask) { - cluster = cpu_rq(i)->cluster; - cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); - - cluster->capacity = compute_capacity(cluster); - cluster->load_scale_factor = compute_load_scale_factor(cluster); - - /* 'cpus' can contain cpumask more than one cluster */ - check_for_up_down_migrate_update(&cluster->cpus); - } - - __update_min_max_capacity(); - - post_big_task_count_change(cpu_possible_mask); -} - -static DEFINE_SPINLOCK(cpu_freq_min_max_lock); -void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) -{ - struct cpumask cpumask; - struct sched_cluster *cluster; - int i, update_capacity = 0; - unsigned long flags; - - spin_lock_irqsave(&cpu_freq_min_max_lock, flags); - cpumask_copy(&cpumask, cpus); - for_each_cpu(i, &cpumask) { - cluster = cpu_rq(i)->cluster; - cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); - - update_capacity += (cluster->max_mitigated_freq != fmax); - cluster->max_mitigated_freq = fmax; - } - spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); - - if (update_capacity) - update_cpu_cluster_capacity(cpus); -} - -static int cpufreq_notifier_policy(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_policy *policy = (struct cpufreq_policy *)data; - struct sched_cluster *cluster = NULL; - struct cpumask policy_cluster = *policy->related_cpus; - unsigned int orig_max_freq = 0; - int i, j, update_capacity = 0; - - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && - val != CPUFREQ_CREATE_POLICY) - return 0; - - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { - update_min_max_capacity(); - return 0; - } - - max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); - if (min_max_freq == 1) - min_max_freq = UINT_MAX; - min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); - BUG_ON(!min_max_freq); - BUG_ON(!policy->max); - - for_each_cpu(i, &policy_cluster) { - cluster = cpu_rq(i)->cluster; - cpumask_andnot(&policy_cluster, &policy_cluster, - &cluster->cpus); - - orig_max_freq = cluster->max_freq; - cluster->min_freq = policy->min; - cluster->max_freq = policy->max; - cluster->cur_freq = policy->cur; - - if (!cluster->freq_init_done) { - mutex_lock(&cluster_lock); - for_each_cpu(j, &cluster->cpus) - cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, - policy->related_cpus); - cluster->max_possible_freq = policy->cpuinfo.max_freq; - cluster->max_possible_capacity = - compute_max_possible_capacity(cluster); - cluster->freq_init_done = true; - - sort_clusters(); - update_all_clusters_stats(); - mutex_unlock(&cluster_lock); - continue; - } - - update_capacity += (orig_max_freq != cluster->max_freq); - } - - if (update_capacity) - update_cpu_cluster_capacity(policy->related_cpus); - - return 0; -} - -static int cpufreq_notifier_trans(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; - unsigned int cpu = freq->cpu, new_freq = freq->new; - unsigned long flags; - struct sched_cluster *cluster; - struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; - int i, j; - - if (val != CPUFREQ_POSTCHANGE) - return 0; - - BUG_ON(!new_freq); - - if (cpu_cur_freq(cpu) == new_freq) - return 0; - - for_each_cpu(i, 
&policy_cpus) { - cluster = cpu_rq(i)->cluster; - - for_each_cpu(j, &cluster->cpus) { - struct rq *rq = cpu_rq(j); - - raw_spin_lock_irqsave(&rq->lock, flags); - update_task_ravg(rq->curr, rq, TASK_UPDATE, - sched_ktime_clock(), 0); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - - cluster->cur_freq = new_freq; - cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); - } - - return 0; -} - -static int pwr_stats_ready_notifier(struct notifier_block *nb, - unsigned long cpu, void *data) -{ - cpumask_t mask = CPU_MASK_NONE; - - cpumask_set_cpu(cpu, &mask); - sched_update_freq_max_load(&mask); - - mutex_lock(&cluster_lock); - sort_clusters(); - mutex_unlock(&cluster_lock); - - return 0; -} - -static struct notifier_block notifier_policy_block = { - .notifier_call = cpufreq_notifier_policy -}; - -static struct notifier_block notifier_trans_block = { - .notifier_call = cpufreq_notifier_trans -}; - -static struct notifier_block notifier_pwr_stats_ready = { - .notifier_call = pwr_stats_ready_notifier -}; - -int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) -{ - return -EINVAL; -} - -static int register_sched_callback(void) -{ - int ret; - - if (!sched_enable_hmp) - return 0; - - ret = cpufreq_register_notifier(¬ifier_policy_block, - CPUFREQ_POLICY_NOTIFIER); - - if (!ret) - ret = cpufreq_register_notifier(¬ifier_trans_block, - CPUFREQ_TRANSITION_NOTIFIER); - - register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); - - return 0; -} - -/* - * cpufreq callbacks can be registered at core_initcall or later time. - * Any registration done prior to that is "forgotten" by cpufreq. See - * initialization of variable init_cpufreq_transition_notifier_list_called - * for further information. - */ -core_initcall(register_sched_callback); - -static inline int update_preferred_cluster(struct related_thread_group *grp, - struct task_struct *p, u32 old_load) -{ - u32 new_load = task_load(p); - - if (!grp) - return 0; - - /* - * Update if task's load has changed significantly or a complete window - * has passed since we last updated preference - */ - if (abs(new_load - old_load) > sched_ravg_window / 4 || - sched_ktime_clock() - grp->last_update > sched_ravg_window) - return 1; - - return 0; -} - -#else /* CONFIG_SCHED_HMP */ - -static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } - -static void -update_task_ravg(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime) -{ -} - -static inline void mark_task_starting(struct task_struct *p) {} - -static inline void set_window_start(struct rq *rq) {} - -static inline void migrate_sync_cpu(int cpu) {} - -#endif /* CONFIG_SCHED_HMP */ - #ifdef CONFIG_SMP /* * This is how migration works: @@ -4330,34 +1134,6 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ return rq; } -static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, - struct task_struct *p) -{ - struct migration_notify_data mnd; - bool check_groups; - - rcu_read_lock(); - check_groups = rcu_access_pointer(p->grp) != NULL; - rcu_read_unlock(); - - if (!same_freq_domain(src_cpu, dest_cpu)) { - if (!src_cpu_dead) - check_for_freq_change(cpu_rq(src_cpu), false, - check_groups); - check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); - } else { - check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); - } - - if (task_notify_on_migrate(p)) { - mnd.src_cpu = src_cpu; - mnd.dest_cpu = dest_cpu; - mnd.load = pct_task_load(p); - 
atomic_notifier_call_chain(&migration_notifier_head, 0, - (void *)&mnd); - } -} - /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -5181,8 +1957,6 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } -__read_mostly unsigned int sysctl_sched_wakeup_load_threshold = 110; - /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -5203,8 +1977,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, src_cpu, success = 0; - int notify = 0; - struct migration_notify_data mnd; #ifdef CONFIG_SMP unsigned int old_load; struct rq *rq; @@ -5309,31 +2081,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: ttwu_stat(p, cpu, wake_flags); - - if (task_notify_on_migrate(p)) { - mnd.src_cpu = src_cpu; - mnd.dest_cpu = cpu; - mnd.load = pct_task_load(p); - - /* - * Call the migration notifier with mnd for foreground task - * migrations as well as for wakeups if their load is above - * sysctl_sched_wakeup_load_threshold. This would prompt the - * cpu-boost to boost the CPU frequency on wake up of a heavy - * weight foreground task - */ - if ((src_cpu != cpu) || (mnd.load > - sysctl_sched_wakeup_load_threshold)) - notify = 1; - } - out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); - if (notify) - atomic_notifier_call_chain(&migration_notifier_head, - 0, (void *)&mnd); - if (freq_notif_allowed) { if (!same_freq_domain(src_cpu, cpu)) { check_for_freq_change(cpu_rq(cpu), @@ -5466,6 +2216,44 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; } +#ifdef CONFIG_SCHED_HMP +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + u64 wallclock; + + sched_set_group_id(p, 0); + + raw_spin_lock_irqsave(&rq->lock, flags); + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + enqueue_task(rq, p, 0); + clear_ed_task(p, rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + put_cpu(); +} +#endif /* CONFIG_SCHED_HMP */ + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
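[Editor's note] The sched_exit() path re-added above depends on reset_task_stats() preserving a sentinel in sum_history[0], so an exiting task remains recognizable even after it disappears from the task-list iterators that reset_all_window_stats() walks. A compact model of the reset-but-keep-marker step; the marker value, history depth, and demo_* names are illustrative assumptions:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DEMO_EXITING_MARKER 0xdeadbeefU  /* illustrative sentinel value */
#define DEMO_HIST 5

struct demo_ravg {
    uint32_t sum_history[DEMO_HIST];
    uint64_t demand;
    uint64_t mark_start;
};

static int demo_exiting(const struct demo_ravg *r)
{
    return r->sum_history[0] == DEMO_EXITING_MARKER;
}

static void demo_reset_task_stats(struct demo_ravg *r)
{
    uint32_t keep = demo_exiting(r) ? DEMO_EXITING_MARKER : 0;

    memset(r, 0, sizeof(*r));
    r->sum_history[0] = keep;   /* the marker survives every later reset */
}

int main(void)
{
    struct demo_ravg r = { .demand = 123456 };

    r.sum_history[0] = DEMO_EXITING_MARKER;  /* task is exiting */
    demo_reset_task_stats(&r);
    printf("still exiting? %d, demand=%llu\n",
           demo_exiting(&r), (unsigned long long)r.demand);
    return 0;
}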
@@ -5752,6 +2540,7 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, flags); init_new_task_load(p); + add_new_task_to_grp(p); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -6236,37 +3025,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_HMP -static bool early_detection_notify(struct rq *rq, u64 wallclock) -{ - struct task_struct *p; - int loop_max = 10; - - if (!sched_boost() || !rq->cfs.h_nr_running) - return 0; - - rq->ed_task = NULL; - list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { - if (!loop_max) - break; - - if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { - rq->ed_task = p; - return 1; - } - - loop_max--; - } - - return 0; -} -#else /* CONFIG_SCHED_HMP */ -static bool early_detection_notify(struct rq *rq, u64 wallclock) -{ - return 0; -} -#endif /* CONFIG_SCHED_HMP */ - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -10973,8 +7731,11 @@ void __init sched_init(void) rq->avg_irqload = 0; rq->irqload_ts = 0; rq->static_cpu_pwr_cost = 0; - rq->cc.cycles = SCHED_MIN_FREQ; + rq->cc.cycles = 1; rq->cc.time = 1; + rq->cstate = 0; + rq->wakeup_latency = 0; + rq->wakeup_energy = 0; /* * All cpus part of same cluster by default. This avoids the @@ -10982,19 +7743,14 @@ void __init sched_init(void) * like select_best_cpu() */ rq->cluster = &init_cluster; -#ifdef CONFIG_SCHED_FREQ_INPUT rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; rq->old_busy_time = 0; rq->old_estimated_time = 0; rq->old_busy_time_group = 0; - rq->notifier_sent = 0; rq->hmp_stats.pred_demands_sum = 0; #endif -#endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; - rq->cstate = 0; - rq->wakeup_latency = 0; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -11719,7 +8475,7 @@ int sched_rr_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +inline struct task_group *css_tg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct task_group, css) : NULL; } @@ -11793,63 +8549,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } -static u64 cpu_notify_on_migrate_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct task_group *tg = css_tg(css); - - return tg->notify_on_migrate; -} - -static int cpu_notify_on_migrate_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 notify) -{ - struct task_group *tg = css_tg(css); - - tg->notify_on_migrate = (notify > 0); - - return 0; -} - -#ifdef CONFIG_SCHED_HMP - -static u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct task_group *tg = css_tg(css); - - return tg->upmigrate_discouraged; -} - -static int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 upmigrate_discourage) -{ - struct task_group *tg = css_tg(css); - int discourage = upmigrate_discourage > 0; - - if (tg->upmigrate_discouraged == discourage) - return 0; - - /* - * Revisit big-task classification for tasks of this cgroup. It would - * have been efficient to walk tasks of just this cgroup in running - * state, but we don't have easy means to do that. Walk all tasks in - * running state on all cpus instead and re-visit their big task - * classification. 
- */ - get_online_cpus(); - pre_big_task_count_change(cpu_online_mask); - - tg->upmigrate_discouraged = discourage; - - post_big_task_count_change(cpu_online_mask); - put_online_cpus(); - - return 0; -} - -#endif /* CONFIG_SCHED_HMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -12135,11 +8834,6 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, #endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { - { - .name = "notify_on_migrate", - .read_u64 = cpu_notify_on_migrate_read_u64, - .write_u64 = cpu_notify_on_migrate_write_u64, - }, #ifdef CONFIG_SCHED_HMP { .name = "upmigrate_discourage", diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d1c0ef4bf07d..b6dc131f36a6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,8 +327,6 @@ do { \ P(cluster->cur_freq); P(cluster->max_freq); P(cluster->exec_scale_factor); -#endif -#ifdef CONFIG_SCHED_HMP P(hmp_stats.nr_big_tasks); SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg", rq->hmp_stats.cumulative_runnable_avg); @@ -417,10 +415,8 @@ static void sched_debug_header(struct seq_file *m) P(sched_upmigrate); P(sched_downmigrate); P(sched_init_task_load_windows); - P(sched_init_task_load_pelt); P(min_capacity); P(max_capacity); - P(sched_use_pelt); P(sched_ravg_window); #endif #undef PN @@ -644,7 +640,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __P(load_avg); #ifdef CONFIG_SCHED_HMP P(ravg.demand); - P(se.avg.runnable_avg_sum_scaled); #endif #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 584cd048c24b..ce58e2245b4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2577,450 +2577,24 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -static void add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta); -static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods); - -struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) -{ - return NULL; -} - -enum sched_boost_type { - SCHED_BOOST_NONE, - SCHED_BOOST_ON_BIG, - SCHED_BOOST_ON_ALL, -}; - #ifdef CONFIG_SCHED_HMP -/* Initial task load. Newly created tasks are assigned this load. */ -unsigned int __read_mostly sched_init_task_load_pelt; -unsigned int __read_mostly sched_init_task_load_windows; -unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; - -unsigned int max_task_load(void) -{ - if (sched_use_pelt) - return LOAD_AVG_MAX; - - return sched_ravg_window; -} - -/* Use this knob to turn on or off HMP-aware task placement logic */ -unsigned int __read_mostly sched_enable_hmp = 0; - -/* A cpu can no longer accomodate more tasks if: - * - * rq->nr_running > sysctl_sched_spill_nr_run || - * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load - */ -unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; - -/* - * Place sync wakee tasks those have less than configured demand to the waker's - * cluster. - */ -unsigned int __read_mostly sched_small_wakee_task_load; -unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; - -unsigned int __read_mostly sched_big_waker_task_load; -unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; - -/* - * CPUs with load greater than the sched_spill_load_threshold are not - * eligible for task placement. When all CPUs in a cluster achieve a - * load higher than this level, tasks becomes eligible for inter - * cluster migration. 
- */ -unsigned int __read_mostly sched_spill_load; -unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; - -/* - * Tasks whose bandwidth consumption on a cpu is more than - * sched_upmigrate are considered "big" tasks. Big tasks will be - * considered for "up" migration, i.e migrating to a cpu with better - * capacity. - */ -unsigned int __read_mostly sched_upmigrate; -unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; - -/* - * Big tasks, once migrated, will need to drop their bandwidth - * consumption to less than sched_downmigrate before they are "down" - * migrated. - */ -unsigned int __read_mostly sched_downmigrate; -unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; - -#define SCHED_UPMIGRATE_MIN_NICE 15 - -/* - * The load scale factor of a CPU gets boosted when its max frequency - * is restricted due to which the tasks are migrating to higher capacity - * CPUs early. The sched_upmigrate threshold is auto-upgraded by - * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. - */ -unsigned int up_down_migrate_scale_factor = 1024; - -/* - * Scheduler boost is a mechanism to temporarily place tasks on CPUs - * with higher capacity than those where a task would have normally - * ended up with their load characteristics. Any entity enabling - * boost is responsible for disabling it as well. - */ -unsigned int sysctl_sched_boost; - -/* - * Scheduler selects and places task to its previous CPU if sleep time is - * less than sysctl_sched_select_prev_cpu_us. - */ -static unsigned int __read_mostly -sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; -unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; - -static unsigned int __read_mostly -sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; - -unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; - -void update_up_down_migrate(void) -{ - unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); - unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); - unsigned int delta; - - if (up_down_migrate_scale_factor == 1024) - goto done; - - delta = up_migrate - down_migrate; - - up_migrate /= NSEC_PER_USEC; - up_migrate *= up_down_migrate_scale_factor; - up_migrate >>= 10; - up_migrate *= NSEC_PER_USEC; - - up_migrate = min(up_migrate, sched_ravg_window); - - down_migrate /= NSEC_PER_USEC; - down_migrate *= up_down_migrate_scale_factor; - down_migrate >>= 10; - down_migrate *= NSEC_PER_USEC; - - down_migrate = min(down_migrate, up_migrate - delta); -done: - sched_upmigrate = up_migrate; - sched_downmigrate = down_migrate; -} - -void set_hmp_defaults(void) -{ - sched_spill_load = - pct_to_real(sysctl_sched_spill_load_pct); - - update_up_down_migrate(); - -#ifdef CONFIG_SCHED_FREQ_INPUT - sched_major_task_runtime = - mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); -#endif - - sched_init_task_load_pelt = - div64_u64((u64)sysctl_sched_init_task_load_pct * - (u64)LOAD_AVG_MAX, 100); - - sched_init_task_load_windows = - div64_u64((u64)sysctl_sched_init_task_load_pct * - (u64)sched_ravg_window, 100); - - sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * - NSEC_PER_USEC; - - sched_small_wakee_task_load = - div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * - (u64)sched_ravg_window, 100); - - sched_big_waker_task_load = - div64_u64((u64)sysctl_sched_big_waker_task_load_pct * - (u64)sched_ravg_window, 100); -} - -u32 sched_get_init_task_load(struct task_struct *p) -{ - return p->init_load_pct; -} - -int sched_set_init_task_load(struct 
task_struct *p, int init_load_pct) -{ - if (init_load_pct < 0 || init_load_pct > 100) - return -EINVAL; - - p->init_load_pct = init_load_pct; - - return 0; -} - -#ifdef CONFIG_CGROUP_SCHED - -static inline int upmigrate_discouraged(struct task_struct *p) -{ - return task_group(p)->upmigrate_discouraged; -} - -#else - -static inline int upmigrate_discouraged(struct task_struct *p) -{ - return 0; -} - -#endif - -/* Is a task "big" on its current cpu */ -static inline int __is_big_task(struct task_struct *p, u64 scaled_load) -{ - int nice = task_nice(p); - - if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) - return 0; - - return scaled_load > sched_upmigrate; -} - -static inline int is_big_task(struct task_struct *p) -{ - return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); -} - -static inline u64 cpu_load(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); -} - -static inline u64 cpu_load_sync(int cpu, int sync) -{ - return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); -} - -static int boost_refcount; -static DEFINE_SPINLOCK(boost_lock); -static DEFINE_MUTEX(boost_mutex); - -static void boost_kick_cpus(void) -{ - int i; - - for_each_online_cpu(i) { - if (cpu_capacity(i) != max_capacity) - boost_kick(i); - } -} - -int sched_boost(void) -{ - return boost_refcount > 0; -} - -int sched_set_boost(int enable) -{ - unsigned long flags; - int ret = 0; - int old_refcount; - - if (!sched_enable_hmp) - return -EINVAL; - - spin_lock_irqsave(&boost_lock, flags); - - old_refcount = boost_refcount; - - if (enable == 1) { - boost_refcount++; - } else if (!enable) { - if (boost_refcount >= 1) - boost_refcount--; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (!old_refcount && boost_refcount) - boost_kick_cpus(); - - trace_sched_set_boost(boost_refcount); - spin_unlock_irqrestore(&boost_lock, flags); - - return ret; -} - -int sched_boost_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - mutex_lock(&boost_mutex); - if (!write) - sysctl_sched_boost = sched_boost(); - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write) - goto done; - - ret = (sysctl_sched_boost <= 1) ? - sched_set_boost(sysctl_sched_boost) : -EINVAL; - -done: - mutex_unlock(&boost_mutex); - return ret; -} - -/* - * Task will fit on a cpu if it's bandwidth consumption on that cpu - * will be less than sched_upmigrate. A big task that was previously - * "up" migrated will be considered fitting on "little" cpu if its - * bandwidth consumption on "little" cpu will be less than - * sched_downmigrate. 
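[Editor's note] The tunables above are stored as percentages of the window and converted to absolute nanosecond thresholds, and update_up_down_migrate() then rescales them when a CPU's max frequency is restricted while preserving the hysteresis gap between upmigrate and downmigrate. A simplified userspace sketch of that conversion (it ignores the microsecond truncation the kernel performs; window size, percentages, and demo_* names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define WINDOW_NS 20000000ULL   /* assumed 20 ms window, illustrative only */

/* Convert a percentage tunable into an absolute, per-window threshold. */
static uint64_t demo_pct_to_ns(unsigned int pct)
{
    return (uint64_t)pct * WINDOW_NS / 100;
}

/*
 * Scale the up/down-migration thresholds by a 1024-based factor (factor > 1024
 * when max frequency is restricted), keeping the gap between the two.
 */
static void demo_update_migrate(unsigned int factor, uint64_t *up, uint64_t *down)
{
    uint64_t u = demo_pct_to_ns(80), d = demo_pct_to_ns(60);  /* default percentages */
    uint64_t gap = u - d;

    if (factor != 1024) {
        u = u * factor >> 10;
        if (u > WINDOW_NS)
            u = WINDOW_NS;
        d = d * factor >> 10;
        if (d > u - gap)
            d = u - gap;    /* never let the hysteresis gap collapse */
    }
    *up = u;
    *down = d;
}

int main(void)
{
    uint64_t up, down;

    demo_update_migrate(1280, &up, &down);   /* e.g. max_possible_freq / max_freq = 1.25 */
    printf("upmigrate=%llu ns downmigrate=%llu ns\n",
           (unsigned long long)up, (unsigned long long)down);
    return 0;
}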
This will help avoid frequenty migrations for - * tasks with load close to the upmigrate threshold - */ - -static int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type) -{ - int upmigrate; - - if (cpu_capacity(cpu) == max_capacity) - return 1; - - if (boost_type != SCHED_BOOST_ON_BIG) { - if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || - upmigrate_discouraged(p)) - return 1; - - upmigrate = sched_upmigrate; - if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) - upmigrate = sched_downmigrate; - - if (task_load < upmigrate) - return 1; - } - - return 0; -} - -static enum sched_boost_type sched_boost_type(void) -{ - if (sched_boost()) { - if (min_possible_efficiency != max_possible_efficiency) - return SCHED_BOOST_ON_BIG; - else - return SCHED_BOOST_ON_ALL; - } - return SCHED_BOOST_NONE; -} - -static int task_will_fit(struct task_struct *p, int cpu) -{ - u64 tload = scale_load_to_cpu(task_load(p), cpu); - - return task_load_will_fit(p, tload, cpu, sched_boost_type()); -} - -int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand) -{ - int cpu = cluster_first_cpu(cluster); - int prev_capacity = 0; - unsigned int threshold = sched_upmigrate; - u64 load; - - if (cluster->capacity == max_capacity) - return 1; - - if (grp->preferred_cluster) - prev_capacity = grp->preferred_cluster->capacity; - - if (cluster->capacity < prev_capacity) - threshold = sched_downmigrate; - - load = scale_load_to_cpu(demand, cpu); - if (load < threshold) - return 1; - - return 0; -} - -/* - * Return the cost of running task p on CPU cpu. This function - * currently assumes that task p is the only task which will run on - * the CPU. - */ -unsigned int power_cost(int cpu, u64 demand) -{ - int first, mid, last; - struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); - struct cpu_pstate_pwr *costs; - struct freq_max_load *max_load; - int total_static_pwr_cost = 0; - struct rq *rq = cpu_rq(cpu); - unsigned int pc; - - if (!per_cpu_info || !per_cpu_info[cpu].ptable) - /* When power aware scheduling is not in use, or CPU - * power data is not available, just use the CPU - * capacity as a rough stand-in for real CPU power - * numbers, assuming bigger CPUs are more power - * hungry. 
*/ - return cpu_max_possible_capacity(cpu); - - rcu_read_lock(); - max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); - if (!max_load) { - pc = cpu_max_possible_capacity(cpu); - goto unlock; - } - - costs = per_cpu_info[cpu].ptable; - - if (demand <= max_load->freqs[0].hdemand) { - pc = costs[0].power; - goto unlock; - } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { - pc = costs[max_load->length - 1].power; - goto unlock; - } - - first = 0; - last = max_load->length - 1; - mid = (last - first) >> 1; - while (1) { - if (demand <= max_load->freqs[mid].hdemand) - last = mid; - else - first = mid; - - if (last - first == 1) - break; - mid = first + ((last - first) >> 1); - } - - pc = costs[last].power; - -unlock: - rcu_read_unlock(); - - if (idle_cpu(cpu) && rq->cstate) { - total_static_pwr_cost += rq->static_cpu_pwr_cost; - if (rq->cluster->dstate) - total_static_pwr_cost += - rq->cluster->static_cluster_pwr_cost; - } - - return pc + total_static_pwr_cost; - -} +/* CPU selection flag */ +#define SBC_FLAG_PREV_CPU 0x1 +#define SBC_FLAG_BEST_CAP_CPU 0x2 +#define SBC_FLAG_CPU_COST 0x4 +#define SBC_FLAG_MIN_COST 0x8 +#define SBC_FLAG_IDLE_LEAST_LOADED 0x10 +#define SBC_FLAG_IDLE_CSTATE 0x20 +#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40 +#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80 +#define SBC_FLAG_CSTATE_LOAD 0x100 +#define SBC_FLAG_BEST_SIBLING 0x200 + +/* Cluster selection flag */ +#define SBC_FLAG_COLOC_CLUSTER 0x10000 +#define SBC_FLAG_WAKER_CLUSTER 0x20000 +#define SBC_FLAG_BACKUP_CLUSTER 0x40000 struct cpu_select_env { struct task_struct *p; @@ -3036,6 +2610,8 @@ struct cpu_select_env { DECLARE_BITMAP(backup_list, NR_CPUS); u64 task_load; u64 cpu_load; + u32 sbc_best_flag; + u32 sbc_best_cluster_flag; }; struct cluster_cpu_stats { @@ -3047,45 +2623,7 @@ struct cluster_cpu_stats { s64 highest_spare_capacity; }; -#define UP_MIGRATION 1 -#define DOWN_MIGRATION 2 -#define IRQLOAD_MIGRATION 3 - -/* - * Invoked from three places: - * 1) try_to_wake_up() -> ... -> select_best_cpu() - * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu() - * 3) can_migrate_task() - * - * Its safe to de-reference p->grp in first case (since p->pi_lock is held) - * but not in other cases. 
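[Editor's note] power_cost() above binary-searches a per-CPU table sorted by the demand each frequency level can absorb, then returns the matching power entry (plus static costs when the CPU or cluster is idle in a low-power state). A self-contained model of the table lookup; the arrays stand in for the per-CPU power tables and their values are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Demand each frequency level can absorb, and its power cost (illustrative). */
static const uint64_t demo_hdemand[] = { 4000, 8000, 12000, 16000 };
static const unsigned int demo_power[] = { 50, 90, 160, 280 };
#define DEMO_LEVELS 4

static unsigned int demo_power_cost(uint64_t demand)
{
    int first = 0, last = DEMO_LEVELS - 1, mid;

    if (demand <= demo_hdemand[0])
        return demo_power[0];
    if (demand > demo_hdemand[DEMO_LEVELS - 1])
        return demo_power[DEMO_LEVELS - 1];

    /* Find the lowest level whose capacity still covers the demand. */
    while (last - first > 1) {
        mid = first + (last - first) / 2;
        if (demand <= demo_hdemand[mid])
            last = mid;
        else
            first = mid;
    }
    return demo_power[last];
}

int main(void)
{
    printf("cost for demand 9000: %u\n", demo_power_cost(9000));   /* prints 160 */
    return 0;
}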
p->grp is hence freed after a RCU grace period and - * accessed under rcu_read_lock() - */ -static inline int -preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) -{ - struct related_thread_group *grp; - int rc = 0; - - rcu_read_lock(); - - grp = task_related_thread_group(p); - if (!grp || !sysctl_sched_enable_colocation) - rc = 1; - else - rc = (grp->preferred_cluster == cluster); - - rcu_read_unlock(); - return rc; -} - -static inline struct sched_cluster *rq_cluster(struct rq *rq) -{ - return rq->cluster; -} - -static int -spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) { u64 total_load; @@ -3168,6 +2706,7 @@ select_least_power_cluster(struct cpu_select_env *env) if (env->rtg) { env->task_load = scale_load_to_cpu(task_load(env->p), cluster_first_cpu(env->rtg->preferred_cluster)); + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; return env->rtg->preferred_cluster; } @@ -3246,6 +2785,7 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats) update_spare_capacity(stats, env, i, next->capacity, cpu_load_sync(i, env->sync)); } + env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER; } } @@ -3317,6 +2857,7 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, stats->best_cpu_cstate = cpu_cstate; stats->best_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CPU_COST; return; } @@ -3329,12 +2870,14 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, stats->best_cpu_cstate = cpu_cstate; stats->best_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER; return; } /* C-state is the same. Use prev CPU to break the tie */ if (cpu == prev_cpu) { stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER; return; } @@ -3343,6 +2886,7 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, (cpu_cstate > 0 && env->cpu_load > stats->best_load))) { stats->best_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD; } } #else /* CONFIG_SCHED_HMP_CSTATE_AWARE */ @@ -3373,10 +2917,11 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, stats->min_cost = cpu_cost; stats->min_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_MIN_COST; } } } -#endif +#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */ static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, struct cpu_select_env *env) @@ -3530,8 +3075,8 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, { struct sched_cluster *cluster, *pref_cluster = NULL; struct cluster_cpu_stats stats; - bool fast_path = false; struct related_thread_group *grp; + unsigned int sbc_flag = 0; struct cpu_select_env env = { .p = p, @@ -3543,6 +3088,8 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .prev_cpu = target, .ignore_prev_cpu = 0, .rtg = NULL, + .sbc_best_flag = 0, + .sbc_best_cluster_flag = 0, }; bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); @@ -3567,8 +3114,10 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, env.need_waker_cluster = 1; bitmap_zero(env.candidate_list, NR_CPUS); __set_bit(cluster->id, env.candidate_list); + env.sbc_best_cluster_flag = SBC_FLAG_WAKER_CLUSTER; + } else if (bias_to_prev_cpu(&env, &stats)) { - fast_path = true; + sbc_flag = SBC_FLAG_PREV_CPU; 
goto out; } } @@ -3592,15 +3141,20 @@ retry: } while ((cluster = next_best_cluster(cluster, &env, &stats))); if (env.need_idle) { - if (stats.best_idle_cpu >= 0) + if (stats.best_idle_cpu >= 0) { target = stats.best_idle_cpu; - else if (stats.least_loaded_cpu >= 0) + sbc_flag |= SBC_FLAG_IDLE_CSTATE; + } else if (stats.least_loaded_cpu >= 0) { target = stats.least_loaded_cpu; + sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED; + } } else if (stats.best_cpu >= 0) { if (stats.best_cpu != task_cpu(p) && - stats.min_cost == stats.best_sibling_cpu_cost) + stats.min_cost == stats.best_sibling_cpu_cost) { stats.best_cpu = stats.best_sibling_cpu; - + sbc_flag |= SBC_FLAG_BEST_SIBLING; + } + sbc_flag |= env.sbc_best_flag; target = stats.best_cpu; } else { if (env.rtg) { @@ -3609,66 +3163,20 @@ retry: } find_backup_cluster(&env, &stats); - if (stats.best_capacity_cpu >= 0) + if (stats.best_capacity_cpu >= 0) { target = stats.best_capacity_cpu; + sbc_flag |= SBC_FLAG_BEST_CAP_CPU; + } } p->last_cpu_selected_ts = sched_ktime_clock(); - + sbc_flag |= env.sbc_best_cluster_flag; out: rcu_read_unlock(); trace_sched_task_load(p, sched_boost(), env.reason, env.sync, - env.need_idle, fast_path, target); + env.need_idle, sbc_flag, target); return target; } -static void -inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) -{ - if (!sched_enable_hmp || sched_disable_window_stats) - return; - - if (is_big_task(p)) - stats->nr_big_tasks++; -} - -static void -dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) -{ - if (!sched_enable_hmp || sched_disable_window_stats) - return; - - if (is_big_task(p)) - stats->nr_big_tasks--; - - BUG_ON(stats->nr_big_tasks < 0); -} - -static void -inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) -{ - inc_nr_big_task(&rq->hmp_stats, p); - if (change_cra) - inc_cumulative_runnable_avg(&rq->hmp_stats, p); -} - -static void -dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) -{ - dec_nr_big_task(&rq->hmp_stats, p); - if (change_cra) - dec_cumulative_runnable_avg(&rq->hmp_stats, p); -} - -static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) -{ - stats->nr_big_tasks = 0; - if (reset_cra) { - stats->cumulative_runnable_avg = 0; - set_pred_demands_sum(stats, 0); - } -} - - #ifdef CONFIG_CFS_BANDWIDTH static inline struct task_group *next_task_group(struct task_group *tg) @@ -3683,7 +3191,7 @@ static inline struct task_group *next_task_group(struct task_group *tg) for (tg = container_of(&task_groups, struct task_group, list); \ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));) -static void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) +void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { struct task_group *tg; struct cfs_rq *cfs_rq; @@ -3696,66 +3204,6 @@ static void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) rcu_read_unlock(); } -#else /* CONFIG_CFS_BANDWIDTH */ - -static inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } - -#endif /* CONFIG_CFS_BANDWIDTH */ - -/* - * Return total number of tasks "eligible" to run on highest capacity cpu - * - * This is simply nr_big_tasks for cpus which are not of max_capacity and - * nr_running for cpus of max_capacity - */ -unsigned int nr_eligible_big_tasks(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - int nr_big = rq->hmp_stats.nr_big_tasks; - int nr = rq->nr_running; - - if (cpu_max_possible_capacity(cpu) != max_possible_capacity) - return nr_big; - - return nr; -} - -/* - * reset_cpu_hmp_stats - reset HMP stats for a cpu - * 
nr_big_tasks - * cumulative_runnable_avg (iff reset_cra is true) - */ -void reset_cpu_hmp_stats(int cpu, int reset_cra) -{ - reset_cfs_rq_hmp_stats(cpu, reset_cra); - reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); -} - -static void -fixup_nr_big_tasks(struct hmp_sched_stats *stats, struct task_struct *p, - s64 delta) -{ - u64 new_task_load; - u64 old_task_load; - - if (!sched_enable_hmp || sched_disable_window_stats) - return; - - old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); - new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); - - if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) - stats->nr_big_tasks--; - else if (!__is_big_task(p, old_task_load) && - __is_big_task(p, new_task_load)) - stats->nr_big_tasks++; - - BUG_ON(stats->nr_big_tasks < 0); -} - - -#ifdef CONFIG_CFS_BANDWIDTH - static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, @@ -3764,8 +3212,8 @@ static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, struct task_struct *p, int change_cra); /* Add task's contribution to a cpu' HMP statistics */ -static void -_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; @@ -3857,6 +3305,8 @@ static int task_will_be_throttled(struct task_struct *p); #else /* CONFIG_CFS_BANDWIDTH */ +inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } + static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { @@ -3887,8 +3337,8 @@ static inline int task_will_be_throttled(struct task_struct *p) return 0; } -static void -_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) { inc_nr_big_task(&rq->hmp_stats, p); } @@ -3896,179 +3346,6 @@ _inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) #endif /* CONFIG_CFS_BANDWIDTH */ /* - * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. 
- */ -static void update_nr_big_tasks(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct task_struct *p; - - /* Do not reset cumulative_runnable_avg */ - reset_cpu_hmp_stats(cpu, 0); - - list_for_each_entry(p, &rq->cfs_tasks, se.group_node) - _inc_hmp_sched_stats_fair(rq, p, 0); -} - -/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ -void pre_big_task_count_change(const struct cpumask *cpus) -{ - int i; - - local_irq_disable(); - - for_each_cpu(i, cpus) - raw_spin_lock(&cpu_rq(i)->lock); -} - -/* - * Reinitialize 'nr_big_tasks' counters on all affected cpus - */ -void post_big_task_count_change(const struct cpumask *cpus) -{ - int i; - - /* Assumes local_irq_disable() keeps online cpumap stable */ - for_each_cpu(i, cpus) - update_nr_big_tasks(i); - - for_each_cpu(i, cpus) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_enable(); -} - -DEFINE_MUTEX(policy_mutex); - -#ifdef CONFIG_SCHED_FREQ_INPUT -static inline int invalid_value_freq_input(unsigned int *data) -{ - if (data == &sysctl_sched_freq_aggregate) - return !(*data == 0 || *data == 1); - - return 0; -} -#else -static inline int invalid_value_freq_input(unsigned int *data) -{ - return 0; -} -#endif - -static inline int invalid_value(unsigned int *data) -{ - unsigned int val = *data; - - if (data == &sysctl_sched_ravg_hist_size) - return (val < 2 || val > RAVG_HIST_SIZE_MAX); - - if (data == &sysctl_sched_window_stats_policy) - return val >= WINDOW_STATS_INVALID_POLICY; - - return invalid_value_freq_input(data); -} - -/* - * Handle "atomic" update of sysctl_sched_window_stats_policy, - * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. - */ -int sched_window_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - unsigned int *data = (unsigned int *)table->data; - unsigned int old_val; - - if (!sched_enable_hmp) - return -EINVAL; - - mutex_lock(&policy_mutex); - - old_val = *data; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || (write && (old_val == *data))) - goto done; - - if (invalid_value(data)) { - *data = old_val; - ret = -EINVAL; - goto done; - } - - reset_all_window_stats(0, 0); - -done: - mutex_unlock(&policy_mutex); - - return ret; -} - -/* - * Convert percentage value into absolute form. This will avoid div() operation - * in fast path, to convert task load in percentage scale. - */ -int sched_hmp_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - unsigned int old_val; - unsigned int *data = (unsigned int *)table->data; - int update_min_nice = 0; - - mutex_lock(&policy_mutex); - - old_val = *data; - - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write || !sched_enable_hmp) - goto done; - - if (write && (old_val == *data)) - goto done; - - if (data != &sysctl_sched_select_prev_cpu_us) { - /* - * all tunables other than sched_select_prev_cpu_us are - * in percentage. - */ - if (sysctl_sched_downmigrate_pct > - sysctl_sched_upmigrate_pct || *data > 100) { - *data = old_val; - ret = -EINVAL; - goto done; - } - } - - /* - * Big task tunable change will need to re-classify tasks on - * runqueue as big and set their counters appropriately. - * sysctl interface affects secondary variables (*_pct), which is then - * "atomically" carried over to the primary variables. 
Atomic change - * includes taking runqueue lock of all online cpus and re-initiatizing - * their big counter values based on changed criteria. - */ - if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { - get_online_cpus(); - pre_big_task_count_change(cpu_online_mask); - } - - set_hmp_defaults(); - - if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { - post_big_task_count_change(cpu_online_mask); - put_online_cpus(); - } - -done: - mutex_unlock(&policy_mutex); - return ret; -} - -/* * Reset balance_interval at all sched_domain levels of given cpu, so that it * honors kick. */ @@ -4131,8 +3408,6 @@ static inline int migration_needed(struct task_struct *p, int cpu) return 0; } -static DEFINE_RAW_SPINLOCK(migration_lock); - static inline int kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) { @@ -4153,6 +3428,8 @@ kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) return rc; } +static DEFINE_RAW_SPINLOCK(migration_lock); + /* * Check if currently running task should be migrated to a better cpu. * @@ -4183,104 +3460,85 @@ void check_for_migration(struct rq *rq, struct task_struct *p) &rq->active_balance_work); } -static inline int nr_big_tasks(struct rq *rq) -{ - return rq->hmp_stats.nr_big_tasks; -} +#ifdef CONFIG_CFS_BANDWIDTH -unsigned int cpu_temp(int cpu) +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { - struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); - if (per_cpu_info) - return per_cpu_info[cpu].temp; - else - return 0; + cfs_rq->hmp_stats.nr_big_tasks = 0; + cfs_rq->hmp_stats.cumulative_runnable_avg = 0; + cfs_rq->hmp_stats.pred_demands_sum = 0; } -#else /* CONFIG_SCHED_HMP */ - -struct cpu_select_env; -struct sched_cluster; - -static inline int task_will_fit(struct task_struct *p, int cpu, - enum sched_boost_type boost_type) +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { - return 1; + inc_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); } -static inline int select_best_cpu(struct task_struct *p, int target, - int reason, int sync) +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { - return 0; + dec_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); } -unsigned int power_cost(int cpu, u64 demand) +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return SCHED_CAPACITY_SCALE; + stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg += + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum; } -static inline int -spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return 0; -} + stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg -= + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum; -static inline int sched_boost(void) -{ - return 0; + BUG_ON(stats->nr_big_tasks < 0 || + (s64)stats->cumulative_runnable_avg < 0); + verify_pred_demands_sum(stats); } -static inline int is_big_task(struct task_struct *p) -{ - return 0; -} +#else /* CONFIG_CFS_BANDWIDTH */ -static inline int nr_big_tasks(struct rq *rq) -{ - return 0; -} +static inline void 
inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline int is_cpu_throttling_imminent(int cpu) -{ - return 0; -} +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline int is_task_migration_throttled(struct task_struct *p) -{ - return 0; -} +#endif /* CONFIG_CFS_BANDWIDTH */ -unsigned int cpu_temp(int cpu) -{ - return 0; -} +#else /* CONFIG_SCHED_HMP */ -static inline void -inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } -static inline void -dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } -static inline void -inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline void -dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline int -preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return 1; } -static inline struct sched_cluster *rq_cluster(struct rq *rq) +static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return NULL; } #endif /* CONFIG_SCHED_HMP */ - - #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 #error "load tracking assumes 2^10 as unit" #endif @@ -4323,7 +3581,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; unsigned int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq, scale_cpu; - struct sched_entity *se = NULL; delta = now - sa->last_update_time; /* @@ -4344,12 +3601,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; - if (sched_use_pelt && cfs_rq && weight) { - se = container_of(sa, struct sched_entity, avg); - if (entity_is_task(se) && se->on_rq) - dec_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se)); - } - scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); @@ -4370,7 +3621,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scaled_delta_w = cap_scale(delta_w, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta_w; - add_to_scaled_stat(cpu, sa, delta_w); if (cfs_rq) { cfs_rq->runnable_load_sum += weight * scaled_delta_w; @@ -4397,7 +3647,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, contrib = cap_scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; - add_to_scaled_stat(cpu, sa, contrib); if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } @@ -4409,14 +3658,10 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scaled_delta = cap_scale(delta, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta; - add_to_scaled_stat(cpu, sa, delta); if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } - if (se && entity_is_task(se) && se->on_rq) - inc_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se)); - if (running) sa->util_sum += scaled_delta * scale_cpu; @@ -4657,191 +3902,13 @@ static inline int idle_balance(struct rq *rq) return 0; } -static inline void -inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } 
-static inline void -dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } - -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_SCHED_HMP - -#ifdef CONFIG_SCHED_FREQ_INPUT -#define clear_ravg_pred_demand() (p->ravg.pred_demand = 0) -#else -#define clear_ravg_pred_demand() -#endif - -void init_new_task_load(struct task_struct *p) -{ - int i; - u32 init_load_windows = sched_init_task_load_windows; - u32 init_load_pelt = sched_init_task_load_pelt; - u32 init_load_pct = current->init_load_pct; - - p->init_load_pct = 0; - rcu_assign_pointer(p->grp, NULL); - INIT_LIST_HEAD(&p->grp_list); - memset(&p->ravg, 0, sizeof(struct ravg)); - p->cpu_cycles = 0; - - if (init_load_pct) { - init_load_pelt = div64_u64((u64)init_load_pct * - (u64)LOAD_AVG_MAX, 100); - init_load_windows = div64_u64((u64)init_load_pct * - (u64)sched_ravg_window, 100); - } - - p->ravg.demand = init_load_windows; - clear_ravg_pred_demand(); - for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) - p->ravg.sum_history[i] = init_load_windows; - p->se.avg.runnable_avg_sum_scaled = init_load_pelt; -} - -#else /* CONFIG_SCHED_HMP */ - -void init_new_task_load(struct task_struct *p) -{ -} - -#endif /* CONFIG_SCHED_HMP */ - -#ifdef CONFIG_SCHED_HMP - -/* Return task demand in percentage scale */ -unsigned int pct_task_load(struct task_struct *p) -{ - unsigned int load; - - load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); - - return load; -} - -/* - * Add scaled version of 'delta' to runnable_avg_sum_scaled - * 'delta' is scaled in reference to "best" cpu - */ -static inline void -add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta) -{ - int cur_freq = cpu_cur_freq(cpu); - u64 scaled_delta; - int sf; - - if (!sched_enable_hmp) - return; - - if (unlikely(cur_freq > max_possible_freq)) - cur_freq = max_possible_freq; - - scaled_delta = div64_u64(delta * cur_freq, max_possible_freq); - sf = (cpu_efficiency(cpu) * 1024) / max_possible_efficiency; - scaled_delta *= sf; - scaled_delta >>= 10; - sa->runnable_avg_sum_scaled += scaled_delta; -} - -static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods) -{ - if (!sched_enable_hmp) - return; - - sa->runnable_avg_sum_scaled = - decay_load(sa->runnable_avg_sum_scaled, - periods); -} - -#ifdef CONFIG_CFS_BANDWIDTH - -static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) -{ - cfs_rq->hmp_stats.nr_big_tasks = 0; - cfs_rq->hmp_stats.cumulative_runnable_avg = 0; - set_pred_demands_sum(&cfs_rq->hmp_stats, 0); -} - -static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) -{ - inc_nr_big_task(&cfs_rq->hmp_stats, p); - if (change_cra) - inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); -} - -static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) -{ - dec_nr_big_task(&cfs_rq->hmp_stats, p); - if (change_cra) - dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); -} - -static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ - stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; - stats->cumulative_runnable_avg += - cfs_rq->hmp_stats.cumulative_runnable_avg; - set_pred_demands_sum(stats, stats->pred_demands_sum + - cfs_rq->hmp_stats.pred_demands_sum); -} - -static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ - stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; - stats->cumulative_runnable_avg -= - cfs_rq->hmp_stats.cumulative_runnable_avg; - set_pred_demands_sum(stats, 
stats->pred_demands_sum - - cfs_rq->hmp_stats.pred_demands_sum); - - BUG_ON(stats->nr_big_tasks < 0 || - (s64)stats->cumulative_runnable_avg < 0); - verify_pred_demands_sum(stats); -} - -#else /* CONFIG_CFS_BANDWIDTH */ - -static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) { } - -static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) { } - -#endif /* CONFIG_CFS_BANDWIDTH */ - -#else /* CONFIG_SCHED_HMP */ - -static inline void -add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta) -{ -} - -static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods) -{ -} - -static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } - static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, struct task_struct *p, int change_cra) { } static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, struct task_struct *p, int change_cra) { } -static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ -} - -static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ -} - -#endif /* CONFIG_SCHED_HMP */ +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -7579,9 +6646,6 @@ struct lb_env { struct list_head tasks; }; -static DEFINE_PER_CPU(bool, dbs_boost_needed); -static DEFINE_PER_CPU(int, dbs_boost_load_moved); - /* * Is this task likely cache-hot: */ @@ -7792,7 +6856,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); - if (rcu_access_pointer(p->grp)) + if (task_in_related_thread_group(p)) env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; double_unlock_balance(env->src_rq, env->dst_rq); } @@ -7822,7 +6886,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) * inside detach_tasks(). 
*/ schedstat_inc(env->sd, lb_gained[env->idle]); - per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p); return p; } @@ -7895,7 +6958,6 @@ redo: detached++; env->imbalance -= load; - per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p); #ifdef CONFIG_PREEMPT /* @@ -7949,8 +7011,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); - if (task_notify_on_migrate(p)) - per_cpu(dbs_boost_needed, task_cpu(p)) = true; } /* @@ -9241,7 +8301,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - per_cpu(dbs_boost_load_moved, this_cpu) = 0; schedstat_inc(sd, lb_count[idle]); redo: @@ -9436,20 +8495,6 @@ no_move: } } else { sd->nr_balance_failed = 0; - if (per_cpu(dbs_boost_needed, this_cpu)) { - struct migration_notify_data mnd; - - mnd.src_cpu = cpu_of(busiest); - mnd.dest_cpu = this_cpu; - mnd.load = per_cpu(dbs_boost_load_moved, this_cpu); - if (mnd.load > 100) - mnd.load = 100; - atomic_notifier_call_chain(&migration_notifier_head, - 0, (void *)&mnd); - per_cpu(dbs_boost_needed, this_cpu) = false; - per_cpu(dbs_boost_load_moved, this_cpu) = 0; - - } /* Assumes one 'busiest' cpu that we pulled tasks from */ if (!same_freq_domain(this_cpu, cpu_of(busiest))) { @@ -9681,8 +8726,6 @@ static int active_load_balance_cpu_stop(void *data) raw_spin_lock_irq(&busiest_rq->lock); - per_cpu(dbs_boost_load_moved, target_cpu) = 0; - /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) @@ -9765,20 +8808,6 @@ out_unlock: check_for_freq_change(target_rq, true, false); } - if (per_cpu(dbs_boost_needed, target_cpu)) { - struct migration_notify_data mnd; - - mnd.src_cpu = cpu_of(busiest_rq); - mnd.dest_cpu = target_cpu; - mnd.load = per_cpu(dbs_boost_load_moved, target_cpu); - if (mnd.load > 100) - mnd.load = 100; - atomic_notifier_call_chain(&migration_notifier_head, - 0, (void *)&mnd); - - per_cpu(dbs_boost_needed, target_cpu) = false; - per_cpu(dbs_boost_load_moved, target_cpu) = 0; - } return 0; } diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c new file mode 100644 index 000000000000..8da0147b4f89 --- /dev/null +++ b/kernel/sched/hmp.c @@ -0,0 +1,3970 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Implementation credits: Srivatsa Vaddagiri, Steve Muckle + * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti + * and Vikram Mulukutla + */ + +#include <linux/cpufreq.h> +#include <linux/list_sort.h> +#include <linux/syscore_ops.h> + +#include "sched.h" + +#include <trace/events/sched.h> + +static ktime_t ktime_last; +static bool sched_ktime_suspended; + +static bool use_cycle_counter; +static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +inline void clear_ed_task(struct task_struct *p, struct rq *rq) +{ + if (p == rq->ed_task) + rq->ed_task = NULL; +} + +inline void set_task_last_wake(struct task_struct *p, u64 wallclock) +{ + p->last_wake_ts = wallclock; +} + +inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) +{ + p->last_switch_out_ts = wallclock; +} + +/* + * Note C-state for (idle) cpus. + * + * @cstate = cstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cpu + * @wakeup_latency = latency to wakeup from cstate + * + */ +void +sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) +{ + struct rq *rq = cpu_rq(cpu); + + rq->cstate = cstate; /* C1, C2 etc */ + rq->wakeup_energy = wakeup_energy; + rq->wakeup_latency = wakeup_latency; +} + +/* + * Note D-state for (idle) cluster. 
+ * + * @dstate = dstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cluster + * @wakeup_latency = latency to wakeup from cluster + * + */ +void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, + int wakeup_energy, int wakeup_latency) +{ + struct sched_cluster *cluster = + cpu_rq(cpumask_first(cluster_cpus))->cluster; + cluster->dstate = dstate; + cluster->dstate_wakeup_energy = wakeup_energy; + cluster->dstate_wakeup_latency = wakeup_latency; +} + +u32 __weak get_freq_max_load(int cpu, u32 freq) +{ + /* 100% by default */ + return 100; +} + +DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); +static DEFINE_SPINLOCK(freq_max_load_lock); + +struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) +{ + return NULL; +} + +int sched_update_freq_max_load(const cpumask_t *cpumask) +{ + int i, cpu, ret; + unsigned int freq; + struct cpu_pstate_pwr *costs; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct freq_max_load *max_load, *old_max_load; + struct freq_max_load_entry *entry; + u64 max_demand_capacity, max_demand; + unsigned long flags; + u32 hfreq; + int hpct; + + if (!per_cpu_info) + return 0; + + spin_lock_irqsave(&freq_max_load_lock, flags); + max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); + for_each_cpu(cpu, cpumask) { + if (!per_cpu_info[cpu].ptable) { + ret = -EINVAL; + goto fail; + } + + old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + + /* + * Allocate len + 1 entries and leave the last power cost as 0 so that + * power_cost() can stop iterating when per_cpu_info[cpu].len exceeds + * max_load->length due to a race between a cpu power stats update and + * get_cpu_pwr_stats(). + */ + max_load = kzalloc(sizeof(struct freq_max_load) + + sizeof(struct freq_max_load_entry) * + (per_cpu_info[cpu].len + 1), GFP_ATOMIC); + if (unlikely(!max_load)) { + ret = -ENOMEM; + goto fail; + } + + max_load->length = per_cpu_info[cpu].len; + + max_demand = max_demand_capacity * + cpu_max_possible_capacity(cpu); + + i = 0; + costs = per_cpu_info[cpu].ptable; + while (costs[i].freq) { + entry = &max_load->freqs[i]; + freq = costs[i].freq; + hpct = get_freq_max_load(cpu, freq); + if (hpct <= 0 || hpct > 100) + hpct = 100; + hfreq = div64_u64((u64)freq * hpct, 100); + entry->hdemand = + div64_u64(max_demand * hfreq, + cpu_max_possible_freq(cpu)); + i++; + } + + rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); + if (old_max_load) + kfree_rcu(old_max_load, rcu); + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return 0; + +fail: + for_each_cpu(cpu, cpumask) { + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (max_load) { + rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); + kfree_rcu(max_load, rcu); + } + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return ret; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_online_cpu(i) { + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + +
__update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. 
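A rough worked example of the load_scale_factor computation described above, using assumed numbers that are not taken from this patch: consider a cluster whose efficiency is half of max_possible_efficiency and whose max_freq is half of max_possible_freq.
 load_scale_cpu_efficiency() = DIV_ROUND_UP(1024 * max_possible_efficiency, max_possible_efficiency / 2) = 2048
 load_scale_cpu_freq()       = DIV_ROUND_UP(1024 * max_possible_freq, max_possible_freq / 2) = 2048
 compute_load_scale_factor() = (((1024 * 2048) >> 10) * 2048) >> 10 = 4096
So task load observed on such a cluster is scaled up roughly 4x before it is compared against thresholds expressed relative to the best-performing cpu.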
+ */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +struct list_head cluster_head; +static DEFINE_MUTEX(cluster_lock); +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .max_mitigated_freq = UINT_MAX, + .min_freq = 1, + .max_possible_freq = 1, + .dstate = 0, + .dstate_wakeup_energy = 0, + .dstate_wakeup_latency = 0, + .exec_scale_factor = 1024, + .notifier_sent = 0, +}; + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + + pre_big_task_count_change(cpu_possible_mask); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + sched_update_freq_max_load(cpu_possible_mask); + post_big_task_count_change(cpu_possible_mask); +} + +static void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +static void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +static int +compare_clusters(void *priv, struct list_head *a, struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +static void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + } + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + __WARN_printf("Cluster allocation failed. \ + Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->max_mitigated_freq = UINT_MAX; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->dstate = 0; + cluster->dstate_wakeup_energy = 0; + cluster->dstate_wakeup_latency = 0; + cluster->freq_init_done = false; + + cluster->cpus = *cpus; + cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + cluster->notifier_sent = 0; + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + INIT_LIST_HEAD(&cluster_head); +} + +int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + mutex_lock(&cluster_lock); + if (!cb->get_cpu_cycle_counter) { + mutex_unlock(&cluster_lock); + return -EINVAL; + } + + cpu_cycle_counter_cb = *cb; + use_cycle_counter = true; + mutex_unlock(&cluster_lock); + + return 0; +} + +int __init set_sched_enable_hmp(char *str) +{ + int enable_hmp = 0; + + get_option(&str, &enable_hmp); + + sched_enable_hmp = !!enable_hmp; + + return 0; +} + +early_param("sched_enable_hmp", set_sched_enable_hmp); + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +inline void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +/* Clear any HMP scheduler related requests pending from or on cpu */ +void clear_hmp_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + clear_boost_kick(cpu); + clear_reserved(cpu); + if (rq->push_task) { + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->push_task) { + clear_reserved(rq->push_cpu); + put_task_struct(rq->push_task); + rq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + +int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) +{ + struct rq *rq = cpu_rq(cpu); + + rq->static_cpu_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cpu_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->static_cpu_pwr_cost; +} + +int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->static_cluster_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cluster_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; +} + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +/* + * Tasks that are runnable continuously for a period greather than + * EARLY_DETECTION_DURATION can be flagged early as potential + * high load tasks. + */ +#define EARLY_DETECTION_DURATION 9500000 + +static __read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +static __read_mostly unsigned int sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; + +#define SCHED_ACCOUNT_WAIT_TIME 1 + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +unsigned int __read_mostly sysctl_sched_enable_colocation = 1; + +/* + * Enable colocation and frequency aggregation for all threads in a process. + * The children inherits the group id from the parent. 
+ */ +unsigned int __read_mostly sysctl_sched_enable_thread_grouping; + + +__read_mostly unsigned int sysctl_sched_new_task_windows = 5; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 + +/* + * For increase, send notification if + * freq_required - cur_freq > sysctl_sched_freq_inc_notify + */ +__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ + +/* + * For decrease, send notification if + * cur_freq - freq_required > sysctl_sched_freq_dec_notify + */ +__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ + +static __read_mostly unsigned int sched_io_is_busy; + +__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = 10000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +/* + * Major task runtime. If a task runs for more than sched_major_task_runtime + * in a window, it's considered to be generating majority of workload + * for this window. Prediction could be adjusted for such tasks. + */ +__read_mostly unsigned int sched_major_task_runtime = 10000000; + +static unsigned int sync_cpu; + +static LIST_HEAD(related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &related_thread_groups, list) + +/* + * Demand aggregation for frequency purpose: + * + * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads + * for frequency determination purpose. This aggregation is done per-cluster. + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just rq->prev_runnable_sum. + * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. 
+ * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +static __read_mostly unsigned int sched_freq_aggregate; +__read_mostly unsigned int sysctl_sched_freq_aggregate; + +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +/* Use this knob to turn on or off HMP-aware task placement logic */ +unsigned int __read_mostly sched_enable_hmp; + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ +unsigned int sysctl_sched_boost; + +/* A cpu can no longer accommodate more tasks if: + * + * rq->nr_running > sysctl_sched_spill_nr_run || + * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load + */ +unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; + +/* + * Place sync wakee tasks those have less than configured demand to the waker's + * cluster. + */ +unsigned int __read_mostly sched_small_wakee_task_load; +unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; + +unsigned int __read_mostly sched_big_waker_task_load; +unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; + +/* + * CPUs with load greater than the sched_spill_load_threshold are not + * eligible for task placement. When all CPUs in a cluster achieve a + * load higher than this level, tasks becomes eligible for inter + * cluster migration. + */ +unsigned int __read_mostly sched_spill_load; +unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; + +/* + * Tasks whose bandwidth consumption on a cpu is more than + * sched_upmigrate are considered "big" tasks. Big tasks will be + * considered for "up" migration, i.e migrating to a cpu with better + * capacity. + */ +unsigned int __read_mostly sched_upmigrate; +unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; + +/* + * Big tasks, once migrated, will need to drop their bandwidth + * consumption to less than sched_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_downmigrate; +unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; + +/* + * The load scale factor of a CPU gets boosted when its max frequency + * is restricted due to which the tasks are migrating to higher capacity + * CPUs early. The sched_upmigrate threshold is auto-upgraded by + * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. 
+ */ +unsigned int up_down_migrate_scale_factor = 1024; + +/* + * Scheduler selects and places task to its previous CPU if sleep time is + * less than sysctl_sched_select_prev_cpu_us. + */ +unsigned int __read_mostly +sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; + +unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; + +unsigned int __read_mostly +sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; + +unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; + +void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + unsigned int delta; + + if (up_down_migrate_scale_factor == 1024) + goto done; + + delta = up_migrate - down_migrate; + + up_migrate /= NSEC_PER_USEC; + up_migrate *= up_down_migrate_scale_factor; + up_migrate >>= 10; + up_migrate *= NSEC_PER_USEC; + + up_migrate = min(up_migrate, sched_ravg_window); + + down_migrate /= NSEC_PER_USEC; + down_migrate *= up_down_migrate_scale_factor; + down_migrate >>= 10; + down_migrate *= NSEC_PER_USEC; + + down_migrate = min(down_migrate, up_migrate - delta); +done: + sched_upmigrate = up_migrate; + sched_downmigrate = down_migrate; +} + +void set_hmp_defaults(void) +{ + sched_spill_load = + pct_to_real(sysctl_sched_spill_load_pct); + + update_up_down_migrate(); + + sched_major_task_runtime = + mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * + NSEC_PER_USEC; + + sched_small_wakee_task_load = + div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_big_waker_task_load = + div64_u64((u64)sysctl_sched_big_waker_task_load_pct * + (u64)sched_ravg_window, 100); +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED + +int upmigrate_discouraged(struct task_struct *p) +{ + return task_group(p)->upmigrate_discouraged; +} + +#else + +static inline int upmigrate_discouraged(struct task_struct *p) +{ + return 0; +} + +#endif + +/* Is a task "big" on its current cpu */ +static inline int __is_big_task(struct task_struct *p, u64 scaled_load) +{ + int nice = task_nice(p); + + if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) + return 0; + + return scaled_load > sched_upmigrate; +} + +int is_big_task(struct task_struct *p) +{ + return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); +} + +u64 cpu_load(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); +} + +u64 cpu_load_sync(int cpu, int sync) +{ + return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); +} + +static int boost_refcount; +static DEFINE_SPINLOCK(boost_lock); +static DEFINE_MUTEX(boost_mutex); + +static void boost_kick_cpus(void) +{ + int i; + + for_each_online_cpu(i) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int sched_boost(void) +{ + return boost_refcount > 0; +} + +int sched_set_boost(int enable) +{ + unsigned long flags; + int ret = 0; + int old_refcount; + + if (!sched_enable_hmp) + return -EINVAL; + + 
spin_lock_irqsave(&boost_lock, flags); + + old_refcount = boost_refcount; + + if (enable == 1) { + boost_refcount++; + } else if (!enable) { + if (boost_refcount >= 1) + boost_refcount--; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (!old_refcount && boost_refcount) + boost_kick_cpus(); + + trace_sched_set_boost(boost_refcount); + spin_unlock_irqrestore(&boost_lock, flags); + + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&boost_mutex); + if (!write) + sysctl_sched_boost = sched_boost(); + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write) + goto done; + + ret = (sysctl_sched_boost <= 1) ? + sched_set_boost(sysctl_sched_boost) : -EINVAL; + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +/* + * Task will fit on a cpu if it's bandwidth consumption on that cpu + * will be less than sched_upmigrate. A big task that was previously + * "up" migrated will be considered fitting on "little" cpu if its + * bandwidth consumption on "little" cpu will be less than + * sched_downmigrate. This will help avoid frequenty migrations for + * tasks with load close to the upmigrate threshold + */ +int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_type boost_type) +{ + int upmigrate; + + if (cpu_capacity(cpu) == max_capacity) + return 1; + + if (boost_type != SCHED_BOOST_ON_BIG) { + if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) + return 1; + + upmigrate = sched_upmigrate; + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (task_load < upmigrate) + return 1; + } + + return 0; +} + +enum sched_boost_type sched_boost_type(void) +{ + if (sched_boost()) { + if (min_possible_efficiency != max_possible_efficiency) + return SCHED_BOOST_ON_BIG; + else + return SCHED_BOOST_ON_ALL; + } + return SCHED_BOOST_NONE; +} + +int task_will_fit(struct task_struct *p, int cpu) +{ + u64 tload = scale_load_to_cpu(task_load(p), cpu); + + return task_load_will_fit(p, tload, cpu, sched_boost_type()); +} + +int group_will_fit(struct sched_cluster *cluster, + struct related_thread_group *grp, u64 demand) +{ + int cpu = cluster_first_cpu(cluster); + int prev_capacity = 0; + unsigned int threshold = sched_upmigrate; + u64 load; + + if (cluster->capacity == max_capacity) + return 1; + + if (grp->preferred_cluster) + prev_capacity = grp->preferred_cluster->capacity; + + if (cluster->capacity < prev_capacity) + threshold = sched_downmigrate; + + load = scale_load_to_cpu(demand, cpu); + if (load < threshold) + return 1; + + return 0; +} + +/* + * Return the cost of running task p on CPU cpu. This function + * currently assumes that task p is the only task which will run on + * the CPU. + */ +unsigned int power_cost(int cpu, u64 demand) +{ + int first, mid, last; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct cpu_pstate_pwr *costs; + struct freq_max_load *max_load; + int total_static_pwr_cost = 0; + struct rq *rq = cpu_rq(cpu); + unsigned int pc; + + if (!per_cpu_info || !per_cpu_info[cpu].ptable) + /* + * When power aware scheduling is not in use, or CPU + * power data is not available, just use the CPU + * capacity as a rough stand-in for real CPU power + * numbers, assuming bigger CPUs are more power + * hungry. 
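The frequency-table lookup a little further below can be illustrated with a small assumed table; none of these numbers come from this patch:
 freqs[].hdemand = { 2ms, 5ms, 10ms }, costs[].power = { 100, 250, 600 }
 demand = 4ms  -> the binary search ends with last == 1, so pc = costs[1].power = 250
 demand = 12ms -> beyond the last hdemand entry, so pc = costs[2].power = 600
If the cpu is idle in a c-state, rq->static_cpu_pwr_cost is added on top (plus the cluster's static cost when the cluster is also in a d-state) before pc is returned.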
+ */ + return cpu_max_possible_capacity(cpu); + + rcu_read_lock(); + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (!max_load) { + pc = cpu_max_possible_capacity(cpu); + goto unlock; + } + + costs = per_cpu_info[cpu].ptable; + + if (demand <= max_load->freqs[0].hdemand) { + pc = costs[0].power; + goto unlock; + } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { + pc = costs[max_load->length - 1].power; + goto unlock; + } + + first = 0; + last = max_load->length - 1; + mid = (last - first) >> 1; + while (1) { + if (demand <= max_load->freqs[mid].hdemand) + last = mid; + else + first = mid; + + if (last - first == 1) + break; + mid = first + ((last - first) >> 1); + } + + pc = costs[last].power; + +unlock: + rcu_read_unlock(); + + if (idle_cpu(cpu) && rq->cstate) { + total_static_pwr_cost += rq->static_cpu_pwr_cost; + if (rq->cluster->dstate) + total_static_pwr_cost += + rq->cluster->static_cluster_pwr_cost; + } + + return pc + total_static_pwr_cost; + +} + +void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks++; +} + +void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks--; + + BUG_ON(stats->nr_big_tasks < 0); +} + +void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) +{ + stats->nr_big_tasks = 0; + if (reset_cra) { + stats->cumulative_runnable_avg = 0; + stats->pred_demands_sum = 0; + } +} + +/* + * Invoked from three places: + * 1) try_to_wake_up() -> ... -> select_best_cpu() + * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu() + * 3) can_migrate_task() + * + * Its safe to de-reference p->grp in first case (since p->pi_lock is held) + * but not in other cases. 
p->grp is hence freed after a RCU grace period and + * accessed under rcu_read_lock() + */ +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp; + int rc = 0; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (!grp || !sysctl_sched_enable_colocation) + rc = 1; + else + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +struct sched_cluster *rq_cluster(struct rq *rq) +{ + return rq->cluster; +} + +/* + * reset_cpu_hmp_stats - reset HMP stats for a cpu + * nr_big_tasks + * cumulative_runnable_avg (iff reset_cra is true) + */ +void reset_cpu_hmp_stats(int cpu, int reset_cra) +{ + reset_cfs_rq_hmp_stats(cpu, reset_cra); + reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); +} + +void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta) +{ + u64 new_task_load; + u64 old_task_load; + + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); + new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); + + if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) + stats->nr_big_tasks--; + else if (!__is_big_task(p, old_task_load) && + __is_big_task(p, new_task_load)) + stats->nr_big_tasks++; + + BUG_ON(stats->nr_big_tasks < 0); +} + +/* + * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. + */ +static void update_nr_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + /* Do not reset cumulative_runnable_avg */ + reset_cpu_hmp_stats(cpu, 0); + + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) + _inc_hmp_sched_stats_fair(rq, p, 0); +} + +/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ +void pre_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + local_irq_disable(); + + for_each_cpu(i, cpus) + raw_spin_lock(&cpu_rq(i)->lock); +} + +/* + * Reinitialize 'nr_big_tasks' counters on all affected cpus + */ +void post_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + /* Assumes local_irq_disable() keeps online cpumap stable */ + for_each_cpu(i, cpus) + update_nr_big_tasks(i); + + for_each_cpu(i, cpus) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_enable(); +} + +DEFINE_MUTEX(policy_mutex); + +static inline int invalid_value_freq_input(unsigned int *data) +{ + if (data == &sysctl_sched_freq_aggregate) + return !(*data == 0 || *data == 1); + + return 0; +} + +static inline int invalid_value(unsigned int *data) +{ + unsigned int val = *data; + + if (data == &sysctl_sched_ravg_hist_size) + return (val < 2 || val > RAVG_HIST_SIZE_MAX); + + if (data == &sysctl_sched_window_stats_policy) + return val >= WINDOW_STATS_INVALID_POLICY; + + return invalid_value_freq_input(data); +} + +/* + * Handle "atomic" update of sysctl_sched_window_stats_policy, + * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. 
+ */ +int sched_window_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write || (write && (old_val == *data))) + goto done; + + if (invalid_value(data)) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + reset_all_window_stats(0, 0); + +done: + mutex_unlock(&policy_mutex); + + return ret; +} + +/* + * Convert percentage value into absolute form. This will avoid div() operation + * in fast path, to convert task load in percentage scale. + */ +int sched_hmp_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int old_val; + unsigned int *data = (unsigned int *)table->data; + int update_min_nice = 0; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write || !sched_enable_hmp) + goto done; + + if (write && (old_val == *data)) + goto done; + + if (data != &sysctl_sched_select_prev_cpu_us) { + /* + * all tunables other than sched_select_prev_cpu_us are + * in percentage. + */ + if (sysctl_sched_downmigrate_pct > + sysctl_sched_upmigrate_pct || *data > 100) { + *data = old_val; + ret = -EINVAL; + goto done; + } + } + + /* + * Big task tunable change will need to re-classify tasks on + * runqueue as big and set their counters appropriately. + * sysctl interface affects secondary variables (*_pct), which is then + * "atomically" carried over to the primary variables. Atomic change + * includes taking runqueue lock of all online cpus and re-initiatizing + * their big counter values based on changed criteria. 
+ */ + if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + } + + set_hmp_defaults(); + + if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + } + +done: + mutex_unlock(&policy_mutex); + return ret; +} + +inline int nr_big_tasks(struct rq *rq) +{ + return rq->hmp_stats.nr_big_tasks; +} + +unsigned int cpu_temp(int cpu) +{ + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + + if (per_cpu_info) + return per_cpu_info[cpu].temp; + else + return 0; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); + memset(&p->ravg, 0, sizeof(struct ravg)); + p->cpu_cycles = 0; + + if (init_load_pct) + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + + p->ravg.demand = init_load_windows; + p->ravg.pred_demand = 0; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +/* Return task demand in percentage scale */ +unsigned int pct_task_load(struct task_struct *p) +{ + unsigned int load; + + load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); + + return load; +} + +/* + * Return total number of tasks "eligible" to run on highest capacity cpu + * + * This is simply nr_big_tasks for cpus which are not of max_capacity and + * nr_running for cpus of max_capacity + */ +unsigned int nr_eligible_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + int nr_big = rq->hmp_stats.nr_big_tasks; + int nr = rq->nr_running; + + if (cpu_max_possible_capacity(cpu) != max_possible_capacity) + return nr_big; + + return nr; +} + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} + +early_param("sched_ravg_window", set_sched_ravg_window); + +static inline void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + u32 freq; + + freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); + delta *= rq->cluster->exec_scale_factor; + delta >>= 10; + + return delta; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +/* Does freq_required sufficiently exceed or fall behind cur_freq? 
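 *
 * Illustrative numbers: with cur_freq = 1000000, freq_required = 1150000
 * (kHz) and sysctl_sched_freq_inc_notify = 200000, the required increase of
 * 150000 is below the notify threshold, so the two are treated as nearly the
 * same and no alert is raised, while a required increase of 300000 would not
 * be filtered out.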
*/ +static inline int +nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) +{ + int delta = freq_required - cur_freq; + + if (freq_required > cur_freq) + return delta < sysctl_sched_freq_inc_notify; + + delta = -delta; + + return delta < sysctl_sched_freq_dec_notify; +} + +/* Convert busy time to frequency equivalent */ +static inline unsigned int load_to_freq(struct rq *rq, u64 load) +{ + unsigned int freq; + + load = scale_load_to_cpu(load, cpu_of(rq)); + load *= 128; + load = div64_u64(load, max_task_load()); + + freq = load * cpu_max_possible_freq(cpu_of(rq)); + freq /= 128; + + return freq; +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu); + +/* + * Return load from all related group in given cpu. + * Caller must ensure that related_thread_group_lock is held. + */ +static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + + for_each_related_thread_group(grp) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, cpu); + *grp_load += cpu_time->prev_runnable_sum; + if (new_grp_load) + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } +} + +/* + * Return load from all related groups in given frequency domain. + * Caller must ensure that related_thread_group_lock is held. + */ +static void group_load_in_freq_domain(struct cpumask *cpus, + u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + int j; + + for_each_related_thread_group(grp) { + for_each_cpu(j, cpus) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, j); + *grp_load += cpu_time->prev_runnable_sum; + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } + } +} + +/* + * Should scheduler alert governor for changing frequency? + * + * @check_pred - evaluate frequency based on the predictive demand + * @check_groups - add load from all related groups on given cpu + * + * check_groups is set to 1 if a "related" task movement/wakeup is triggering + * the notification check. 
To avoid "re-aggregation" of demand in such cases, + * we check whether the migrated/woken tasks demand (along with demand from + * existing tasks on the cpu) can be met on target cpu + * + */ + +static int send_notification(struct rq *rq, int check_pred, int check_groups) +{ + unsigned int cur_freq, freq_required; + unsigned long flags; + int rc = 0; + u64 group_load = 0, new_load = 0; + + if (!sched_enable_hmp) + return 0; + + if (check_pred) { + u64 prev = rq->old_busy_time; + u64 predicted = rq->hmp_stats.pred_demands_sum; + + if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) + return 0; + + prev = max(prev, rq->old_estimated_time); + if (prev > predicted) + return 0; + + cur_freq = load_to_freq(rq, prev); + freq_required = load_to_freq(rq, predicted); + + if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) + return 0; + } else { + read_lock(&related_thread_group_lock); + /* + * Protect from concurrent update of rq->prev_runnable_sum and + * group cpu load + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (check_groups) + _group_load_in_cpu(cpu_of(rq), &group_load, NULL); + + new_load = rq->prev_runnable_sum + group_load; + + raw_spin_unlock_irqrestore(&rq->lock, flags); + read_unlock(&related_thread_group_lock); + + cur_freq = load_to_freq(rq, rq->old_busy_time); + freq_required = load_to_freq(rq, new_load); + + if (nearly_same_freq(cur_freq, freq_required)) + return 0; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->cluster->notifier_sent) { + rq->cluster->notifier_sent = 1; + rc = 1; + trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, + new_load); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +/* Alert governor if there is a need to change frequency */ +void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) +{ + int cpu = cpu_of(rq); + + if (!send_notification(rq, check_pred, check_groups)) + return; + + atomic_notifier_call_chain( + &load_alert_notifier_head, 0, + (void *)(long)cpu); +} + +void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, + struct task_struct *p) +{ + bool check_groups; + + rcu_read_lock(); + check_groups = task_in_related_thread_group(p); + rcu_read_unlock(); + + if (!same_freq_domain(src_cpu, dest_cpu)) { + if (!src_cpu_dead) + check_for_freq_change(cpu_rq(src_cpu), false, + check_groups); + check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); + } else { + check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); + } +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < sysctl_sched_new_task_windows; +} + +#define INC_STEP 8 +#define DEC_STEP 2 +#define CONSISTENT_THRES 16 +#define INC_STEP_BIG 16 +/* + * bucket_increase - update the count of all buckets + * + * @buckets: array of buckets tracking busy time of a task + * @idx: the index of bucket to be incremented + * + * Each time a complete window finishes, count of bucket that runtime + * falls in (@idx) is incremented. Counts of all other buckets are + * decayed. The rate of increase and decay could be different based + * on current count in the bucket. + */ +static inline void bucket_increase(u8 *buckets, int idx) +{ + int i, step; + + for (i = 0; i < NUM_BUSY_BUCKETS; i++) { + if (idx != i) { + if (buckets[i] > DEC_STEP) + buckets[i] -= DEC_STEP; + else + buckets[i] = 0; + } else { + step = buckets[i] >= CONSISTENT_THRES ? + INC_STEP_BIG : INC_STEP; + if (buckets[i] > U8_MAX - step) + buckets[i] = U8_MAX; + else + buckets[i] += step; + } + } +} + +static inline int busy_to_bucket(u32 normalized_rt) +{ + int bidx; + + bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); + bidx = min(bidx, NUM_BUSY_BUCKETS - 1); + + /* + * Combine lowest two buckets. The lowest frequency falls into + * 2nd bucket and thus keep predicting lowest bucket is not + * useful. + */ + if (!bidx) + bidx++; + + return bidx; +} + +static inline u64 +scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) +{ + return div64_u64(load * (u64)src_freq, (u64)dst_freq); +} + +#define HEAVY_TASK_SKIP 2 +#define HEAVY_TASK_SKIP_LIMIT 4 +/* + * get_pred_busy - calculate predicted demand for a task on runqueue + * + * @rq: runqueue of task p + * @p: task whose prediction is being updated + * @start: starting bucket. returned prediction should not be lower than + * this bucket. + * @runtime: runtime of the task. returned prediction should not be lower + * than this runtime. + * Note: @start can be derived from @runtime. It's passed in only to + * avoid duplicated calculation in some cases. + * + * A new predicted busy time is returned for task @p based on @runtime + * passed in. The function searches through buckets that represent busy + * time equal to or bigger than @runtime and attempts to find the bucket to + * to use for prediction. Once found, it searches through historical busy + * time and returns the latest that falls into the bucket. If no such busy + * time exists, it returns the medium of that bucket. 
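 *
 * Worked example (assuming NUM_BUSY_BUCKETS = 10 for illustration): a runtime
 * of 34% of max_task_load() starts the search at bucket 3. If the lowest
 * filled bucket at or above that is bucket 5, the predicted demand range is
 * [50%, 60%) of max_task_load(); the most recent sum_history[] sample inside
 * that range is returned, or the bucket midpoint (55%) when no such sample
 * exists. The result is never allowed to drop below @runtime.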
+ */ +static u32 get_pred_busy(struct rq *rq, struct task_struct *p, + int start, u32 runtime) +{ + int i; + u8 *buckets = p->ravg.busy_buckets; + u32 *hist = p->ravg.sum_history; + u32 dmin, dmax; + u64 cur_freq_runtime = 0; + int first = NUM_BUSY_BUCKETS, final, skip_to; + u32 ret = runtime; + + /* skip prediction for new tasks due to lack of history */ + if (unlikely(is_new_task(p))) + goto out; + + /* find minimal bucket index to pick */ + for (i = start; i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i]) { + first = i; + break; + } + } + /* if no higher buckets are filled, predict runtime */ + if (first >= NUM_BUSY_BUCKETS) + goto out; + + /* compute the bucket for prediction */ + final = first; + if (first < HEAVY_TASK_SKIP_LIMIT) { + /* compute runtime at current CPU frequency */ + cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, + rq->cluster->efficiency); + cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, + max_possible_freq, rq->cluster->cur_freq); + /* + * if the task runs for majority of the window, try to + * pick higher buckets. + */ + if (cur_freq_runtime >= sched_major_task_runtime) { + int next = NUM_BUSY_BUCKETS; + /* + * if there is a higher bucket that's consistently + * hit, don't jump beyond that. + */ + for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && + i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i] > CONSISTENT_THRES) { + next = i; + break; + } + } + skip_to = min(next, start + HEAVY_TASK_SKIP); + /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ + skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); + /* don't go below first non-empty bucket, if any */ + final = max(first, skip_to); + } + } + + /* determine demand range for the predicted bucket */ + if (final < 2) { + /* lowest two buckets are combined */ + dmin = 0; + final = 1; + } else { + dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); + } + dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); + + /* + * search through runtime history and return first runtime that falls + * into the range of predicted bucket. + */ + for (i = 0; i < sched_ravg_hist_size; i++) { + if (hist[i] >= dmin && hist[i] < dmax) { + ret = hist[i]; + break; + } + } + /* no historical runtime within bucket found, use average of the bin */ + if (ret < dmin) + ret = (dmin + dmax) / 2; + /* + * when updating in middle of a window, runtime could be higher + * than all recorded history. Always predict at least runtime. + */ + ret = max(runtime, ret); +out: + trace_sched_update_pred_demand(rq, p, runtime, + mult_frac((unsigned int)cur_freq_runtime, 100, + sched_ravg_window), ret); + return ret; +} + +static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) +{ + if (p->ravg.pred_demand >= p->ravg.curr_window) + return p->ravg.pred_demand; + + return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), + p->ravg.curr_window); +} + +/* + * predictive demand of a task is calculated at the window roll-over. + * if the task current window busy time exceeds the predicted + * demand, update it here to reflect the task needs. 
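 *
 * For instance, if pred_demand corresponds to 4 ms of busy time but the task
 * has already accumulated 6 ms in the still-open window, the prediction is
 * raised via get_pred_busy() right away rather than at the next roll-over,
 * so the predicted load reported to the governor does not lag a burst.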
+ */ +void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ + u32 new, old; + + if (is_idle_task(p) || exiting_task(p)) + return; + + if (event != PUT_PREV_TASK && event != TASK_UPDATE && + (!SCHED_FREQ_ACCOUNT_WAIT_TIME || + (event != TASK_MIGRATE && + event != PICK_NEXT_TASK))) + return; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) + return; + } + + new = calc_pred_demand(rq, p); + old = p->ravg.pred_demand; + + if (old >= new) + return; + + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, + p->ravg.demand, + new); + + p->ravg.pred_demand = new; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + int flip_counters = 0; + int prev_sum_reset = 0; + bool new_task; + struct related_thread_group *grp; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + grp = p->grp; + if (grp && sched_freq_aggregate) { + /* cpu_time protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu_of(rq)); + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (cpu_time->window_start != rq->window_start) { + int nr_windows; + + delta = rq->window_start - cpu_time->window_start; + nr_windows = div64_u64(delta, window_size); + if (nr_windows > 1) + prev_sum_reset = 1; + + cpu_time->window_start = rq->window_start; + flip_counters = 1; + } + + if (p_is_curr_task && new_window) { + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (full_window) + curr_sum = nt_curr_sum = 0; + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + } + } else { + if (p_is_curr_task && new_window) { + flip_counters = 1; + if (full_window) + prev_sum_reset = 1; + } + } + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. 
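 *
 * Example roll-over (illustrative): if the task last ran in window N and the
 * rq is now in window N+1, the busy time collected in curr_window becomes
 * prev_window and curr_window restarts at zero; if one or more complete
 * windows elapsed since the task's stats were last updated (full_window),
 * prev_window is cleared instead, as the stale contribution belongs to an
 * even older window.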
+ */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!full_window) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (flip_counters) { + u64 curr_sum = *curr_runnable_sum; + u64 nt_curr_sum = *nt_curr_runnable_sum; + + if (prev_sum_reset) + curr_sum = nt_curr_sum = 0; + + *prev_runnable_sum = curr_sum; + *nt_prev_runnable_sum = nt_curr_sum; + + *curr_runnable_sum = 0; + *nt_curr_runnable_sum = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* + * account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO + */ + + if (!new_window) + return; + + /* + * A new window has started. The RQ demand must be rolled + * over if p is the current task. + */ + if (p_is_curr_task) { + /* p is idle task */ + BUG_ON(p != rq->idle); + } + + return; + } + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. 
+ * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. */ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static inline u32 predict_and_update_buckets(struct rq *rq, + struct task_struct *p, u32 runtime) { + + int bidx; + u32 pred_demand; + + bidx = busy_to_bucket(runtime); + pred_demand = get_pred_busy(rq, p, bidx, runtime); + bucket_increase(p->ravg.busy_buckets, bidx); + + return pred_demand; +} + +static void update_task_cpu_cycles(struct task_struct *p, int cpu) +{ + if (use_cycle_counter) + p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); +} + +static void +update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 cur_cycles; + int cpu = cpu_of(rq); + + lockdep_assert_held(&rq->lock); + + if (!use_cycle_counter) { + rq->cc.cycles = cpu_cur_freq(cpu); + rq->cc.time = 1; + return; + } + + cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + + /* + * If current task is idle task and irqtime == 0 CPU was + * indeed idle and probably its cycle counter was not + * increasing. We still need estimatied CPU frequency + * for IO wait time accounting. 
Use the previously + * calculated frequency in such a case. + */ + if (!is_idle_task(rq->curr) || irqtime) { + if (unlikely(cur_cycles < p->cpu_cycles)) + rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); + else + rq->cc.cycles = cur_cycles - p->cpu_cycles; + rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; + + if (event == IRQ_UPDATE && is_idle_task(p)) + /* + * Time between mark_start of idle task and IRQ handler + * entry time is CPU cycle counter stall period. + * Upon IRQ handler entry sched_account_irqstart() + * replenishes idle task's cpu cycle counter so + * rq->cc.cycles now represents increased cycles during + * IRQ handler rather than time between idle entry and + * IRQ exit. Thus use irqtime as time delta. + */ + rq->cc.time = irqtime; + else + rq->cc.time = wallclock - p->ravg.mark_start; + BUG_ON((s64)rq->cc.time < 0); + } + + p->cpu_cycles = cur_cycles; + + trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand, pred_demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + pred_demand = predict_and_update_buckets(rq, p, runtime); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. 
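 *
 * As an illustration of the demand value chosen above: with
 * sched_ravg_hist_size = 5, sum_history = {10, 6, 8, 7, 9} (units
 * illustrative) and a just-completed runtime of 10, WINDOW_STATS_RECENT
 * yields 10, WINDOW_STATS_MAX yields 10, WINDOW_STATS_AVG yields the
 * average 8, and the max-of-average-and-recent policy yields max(8, 10) = 10.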
+ */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, demand, + pred_demand); + + p->ravg.demand = demand; + p->ravg.pred_demand = pred_demand; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. + */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* + * Roll window_start back to current to process any remainder + * in current window. 
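 *
 * (Illustrative walk-through of case (c) above: with window_size = 20 ms,
 * mark_start 15 ms into window W and wallclock 5 ms into window W+3, the
 * first 5 ms close out W, two full 20 ms samples are recorded for W+1 and
 * W+2, and the remaining 5 ms below start the demand sum for W+3.)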
+ */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + if (!rq->window_start || sched_disable_window_stats) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) { + update_task_cpu_cycles(p, cpu_of(rq)); + goto done; + } + + update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + update_task_pred_demand(rq, p, event); +done: + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, + rq->cc.cycles, rq->cc.time, + _group_cpu_time(p->grp, cpu_of(rq))); + + p->ravg.mark_start = wallclock; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + + if (!rq->window_start || sched_disable_window_stats) + return; + + if (is_idle_task(curr)) { + /* We're here without rq->lock held, IRQ disabled */ + raw_spin_lock(&rq->lock); + update_task_cpu_cycles(curr, cpu); + raw_spin_unlock(&rq->lock); + } +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = p->last_wake_ts = wallclock; + p->last_cpu_selected_ts = wallclock; + p->last_switch_out_ts = 0; + update_task_cpu_cycles(p, cpu_of(rq)); +} + +void set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start || !sched_enable_hmp) + return; + + if (cpu == sync_cpu) { + rq->window_start = sched_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void 
migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +static void reset_all_task_stats(void) +{ + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + reset_task_stats(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +} + +static void disable_window_stats(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + sched_disable_window_stats = 1; + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_restore(flags); +} + +/* Called with all cpu's rq->lock held */ +static void enable_window_stats(void) +{ + sched_disable_window_stats = 0; + +} + +enum reset_reason_code { + WINDOW_CHANGE, + POLICY_CHANGE, + HIST_SIZE_CHANGE, + FREQ_AGGREGATE_CHANGE, +}; + +const char *sched_window_reset_reasons[] = { + "WINDOW_CHANGE", + "POLICY_CHANGE", + "HIST_SIZE_CHANGE", +}; + +/* Called with IRQs enabled */ +void reset_all_window_stats(u64 window_start, unsigned int window_size) +{ + int cpu; + unsigned long flags; + u64 start_ts = sched_ktime_clock(); + int reason = WINDOW_CHANGE; + unsigned int old = 0, new = 0; + struct related_thread_group *grp; + + disable_window_stats(); + + reset_all_task_stats(); + + local_irq_save(flags); + + read_lock(&related_thread_group_lock); + + for_each_possible_cpu(cpu) + raw_spin_lock(&cpu_rq(cpu)->lock); + + list_for_each_entry(grp, &related_thread_groups, list) { + int j; + + for_each_possible_cpu(j) { + struct group_cpu_time *cpu_time; + /* Protected by rq lock */ + cpu_time = _group_cpu_time(grp, j); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + if (window_start) + cpu_time->window_start = window_start; + } + } + + if (window_size) { + sched_ravg_window = window_size * TICK_NSEC; + set_hmp_defaults(); + } + + enable_window_stats(); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (window_start) + rq->window_start = window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + reset_cpu_hmp_stats(cpu, 1); + } + + if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { + reason = POLICY_CHANGE; + old = sched_window_stats_policy; + new = sysctl_sched_window_stats_policy; + sched_window_stats_policy = sysctl_sched_window_stats_policy; + } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { + reason = HIST_SIZE_CHANGE; + old = sched_ravg_hist_size; + new = sysctl_sched_ravg_hist_size; + sched_ravg_hist_size = sysctl_sched_ravg_hist_size; + } else if (sched_freq_aggregate != + sysctl_sched_freq_aggregate) { + reason = FREQ_AGGREGATE_CHANGE; + old = sched_freq_aggregate; + new = sysctl_sched_freq_aggregate; + sched_freq_aggregate = sysctl_sched_freq_aggregate; + } + + for_each_possible_cpu(cpu) + raw_spin_unlock(&cpu_rq(cpu)->lock); + + read_unlock(&related_thread_group_lock); + + local_irq_restore(flags); + + trace_sched_reset_all_window_stats(window_start, window_size, + sched_ktime_clock() - start_ts, reason, old, new); +} + +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); + +void sched_get_cpus_busy(struct sched_load *busy, + const struct cpumask *query_cpus) +{ + unsigned long flags; + struct rq *rq; + const int cpus = cpumask_weight(query_cpus); + u64 load[cpus], group_load[cpus]; + u64 nload[cpus], ngload[cpus]; + u64 pload[cpus]; + unsigned int cur_freq[cpus], max_freq[cpus]; + int notifier_sent = 0; + int 
early_detection[cpus]; + int cpu, i = 0; + unsigned int window_size; + u64 max_prev_sum = 0; + int max_busy_cpu = cpumask_first(query_cpus); + struct related_thread_group *grp; + + if (unlikely(cpus == 0)) + return; + + /* + * This function could be called in timer context, and the + * current task may have been executing for a long time. Ensure + * that the window stats are current by doing an update. + */ + read_lock(&related_thread_group_lock); + + local_irq_save(flags); + for_each_cpu(cpu, query_cpus) + raw_spin_lock(&cpu_rq(cpu)->lock); + + window_size = sched_ravg_window; + + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), + 0); + cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + + load[i] = rq->old_busy_time = rq->prev_runnable_sum; + nload[i] = rq->nt_prev_runnable_sum; + pload[i] = rq->hmp_stats.pred_demands_sum; + rq->old_estimated_time = pload[i]; + + if (load[i] > max_prev_sum) { + max_prev_sum = load[i]; + max_busy_cpu = cpu; + } + + /* + * sched_get_cpus_busy() is called for all CPUs in a + * frequency domain. So the notifier_sent flag per + * cluster works even when a frequency domain spans + * more than 1 cluster. + */ + if (rq->cluster->notifier_sent) { + notifier_sent = 1; + rq->cluster->notifier_sent = 0; + } + early_detection[i] = (rq->ed_task != NULL); + cur_freq[i] = cpu_cur_freq(cpu); + max_freq[i] = cpu_max_freq(cpu); + i++; + } + + for_each_related_thread_group(grp) { + for_each_cpu(cpu, query_cpus) { + /* Protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu); + sync_window_start(cpu_rq(cpu), cpu_time); + } + } + + i = 0; + for_each_cpu(cpu, query_cpus) { + group_load[i] = 0; + ngload[i] = 0; + + if (early_detection[i]) + goto skip_early; + + rq = cpu_rq(cpu); + if (!notifier_sent) { + if (cpu == max_busy_cpu) + group_load_in_freq_domain( + &rq->freq_domain_cpumask, + &group_load[i], &ngload[i]); + } else { + _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); + } + + load[i] += group_load[i]; + nload[i] += ngload[i]; + /* + * Scale load in reference to cluster max_possible_freq. + * + * Note that scale_load_to_cpu() scales load in reference to + * the cluster max_freq. 
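 *
 * Illustrative numbers for the common no-notification path: 5 ms of busy
 * time referenced to a cluster max_freq of 1500 MHz is first converted to
 * its equivalent at the current 1000 MHz (7.5 ms, clamped to one window),
 * then re-referenced to a max_possible_freq of 2000 MHz, yielding 3.75 ms
 * before the final conversion to microseconds for the governor.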
+ */ + load[i] = scale_load_to_cpu(load[i], cpu); + nload[i] = scale_load_to_cpu(nload[i], cpu); + pload[i] = scale_load_to_cpu(pload[i], cpu); +skip_early: + i++; + } + + for_each_cpu(cpu, query_cpus) + raw_spin_unlock(&(cpu_rq(cpu))->lock); + local_irq_restore(flags); + + read_unlock(&related_thread_group_lock); + + i = 0; + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + if (early_detection[i]) { + busy[i].prev_load = div64_u64(sched_ravg_window, + NSEC_PER_USEC); + busy[i].new_task_load = 0; + goto exit_early; + } + + if (!notifier_sent) { + load[i] = scale_load_to_freq(load[i], max_freq[i], + cur_freq[i]); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cur_freq[i]); + if (load[i] > window_size) + load[i] = window_size; + if (nload[i] > window_size) + nload[i] = window_size; + + load[i] = scale_load_to_freq(load[i], cur_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], cur_freq[i], + cpu_max_possible_freq(cpu)); + } else { + load[i] = scale_load_to_freq(load[i], max_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cpu_max_possible_freq(cpu)); + } + pload[i] = scale_load_to_freq(pload[i], max_freq[i], + rq->cluster->max_possible_freq); + + busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); + busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); + busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); + +exit_early: + trace_sched_get_busy(cpu, busy[i].prev_load, + busy[i].new_task_load, + busy[i].predicted_load, + early_detection[i]); + i++; + } +} + +void sched_set_io_is_busy(int val) +{ + sched_io_is_busy = val; +} + +int sched_set_window(u64 window_start, unsigned int window_size) +{ + u64 now, cur_jiffies, jiffy_ktime_ns; + s64 ws; + unsigned long flags; + + if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW) + return -EINVAL; + + mutex_lock(&policy_mutex); + + /* + * Get a consistent view of ktime, jiffies, and the time + * since the last jiffy (based on last_jiffies_update). + */ + local_irq_save(flags); + cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); + local_irq_restore(flags); + + /* translate window_start from jiffies to nanoseconds */ + ws = (window_start - cur_jiffies); /* jiffy difference */ + ws *= TICK_NSEC; + ws += jiffy_ktime_ns; + + /* + * Roll back calculated window start so that it is in + * the past (window stats must have a current window). 
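 *
 * For example (illustrative): with HZ = 100 and window_size = 2 ticks
 * (20 ms), a window_start passed in 3 jiffies ahead of the current jiffy
 * translates to roughly now + 30 ms, so the loop below subtracts 20 ms
 * twice until the start lies at or before 'now'.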
+ */ + while (ws > now) + ws -= (window_size * TICK_NSEC); + + BUG_ON(sched_ktime_clock() < ws); + + reset_all_window_stats(ws, window_size); + + sched_update_freq_max_load(cpu_possible_mask); + + mutex_unlock(&policy_mutex); + + return 0; +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + struct migration_sum_data d; + bool new_task; + struct related_thread_group *grp; + + if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) + return; + + if (exiting_task(p)) { + clear_ed_task(p, src_rq); + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + update_task_cpu_cycles(p, new_cpu); + + new_task = is_new_task(p); + /* Protected by rq_lock */ + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time; + + migrate_type = GROUP_TO_GROUP; + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + sync_window_start(dest_rq, cpu_time); + } else { + migrate_type = RQ_TO_RQ; + d.src_rq = src_rq; + d.src_cpu_time = NULL; + d.dst_rq = dest_rq; + d.dst_cpu_time = NULL; + src_curr_runnable_sum = &src_rq->curr_runnable_sum; + src_prev_runnable_sum = &src_rq->prev_runnable_sum; + src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; + + dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; + dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; + dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; + } + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + } + + if (p == src_rq->ed_task) { + src_rq->ed_task = NULL; + if (!dest_rq->ed_task) + dest_rq->ed_task = p; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + 
BUG_ON((s64)*src_prev_runnable_sum < 0); + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_nt_prev_runnable_sum < 0); + BUG_ON((s64)*src_nt_curr_runnable_sum < 0); + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +#define sched_up_down_migrate_auto_update 1 +static void check_for_up_down_migrate_update(const struct cpumask *cpus) +{ + int i = cpumask_first(cpus); + + if (!sched_up_down_migrate_auto_update) + return; + + if (cpu_max_possible_capacity(i) == max_possible_capacity) + return; + + if (cpu_max_possible_freq(i) == cpu_max_freq(i)) + up_down_migrate_scale_factor = 1024; + else + up_down_migrate_scale_factor = (1024 * + cpu_max_possible_freq(i)) / cpu_max_freq(i); + + update_up_down_migrate(); +} + +/* Return cluster which can offer required capacity for group */ +static struct sched_cluster * +best_cluster(struct related_thread_group *grp, u64 total_demand) +{ + struct sched_cluster *cluster = NULL; + + for_each_sched_cluster(cluster) { + if (group_will_fit(cluster, grp, total_demand)) + return cluster; + } + + return NULL; +} + +static void _set_preferred_cluster(struct related_thread_group *grp) +{ + struct task_struct *p; + u64 combined_demand = 0; + + if (!sysctl_sched_enable_colocation) { + grp->last_update = sched_ktime_clock(); + grp->preferred_cluster = NULL; + return; + } + + /* + * wakeup of two or more related tasks could race with each other and + * could result in multiple calls to _set_preferred_cluster being issued + * at same time. Avoid overhead in such cases of rechecking preferred + * cluster + */ + if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) + return; + + list_for_each_entry(p, &grp->tasks, grp_list) + combined_demand += p->ravg.demand; + + grp->preferred_cluster = best_cluster(grp, combined_demand); + grp->last_update = sched_ktime_clock(); + trace_sched_set_preferred_cluster(grp, combined_demand); +} + +void set_preferred_cluster(struct related_thread_group *grp) +{ + raw_spin_lock(&grp->lock); + _set_preferred_cluster(grp); + raw_spin_unlock(&grp->lock); +} + +#define ADD_TASK 0 +#define REM_TASK 1 + +static inline void free_group_cputime(struct related_thread_group *grp) +{ + free_percpu(grp->cpu_time); +} + +static int alloc_group_cputime(struct related_thread_group *grp) +{ + int i; + struct group_cpu_time *cpu_time; + int cpu = raw_smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + u64 window_start = rq->window_start; + + grp->cpu_time = alloc_percpu(struct group_cpu_time); + if (!grp->cpu_time) + return -ENOMEM; + + for_each_possible_cpu(i) { + cpu_time = per_cpu_ptr(grp->cpu_time, i); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + cpu_time->window_start = window_start; + } + + return 0; +} + +/* + * A group's window_start may be behind. When moving it forward, flip prev/curr + * counters. 
When moving forward > 1 window, prev counter is set to 0 + */ +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) +{ + u64 delta; + int nr_windows; + u64 curr_sum = cpu_time->curr_runnable_sum; + u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; + + delta = rq->window_start - cpu_time->window_start; + if (!delta) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + if (nr_windows > 1) + curr_sum = nt_curr_sum = 0; + + cpu_time->prev_runnable_sum = curr_sum; + cpu_time->curr_runnable_sum = 0; + + cpu_time->nt_prev_runnable_sum = nt_curr_sum; + cpu_time->nt_curr_runnable_sum = 0; + + cpu_time->window_start = rq->window_start; +} + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + struct migration_sum_data d; + int migrate_type; + + if (!sched_freq_aggregate) + return; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + + /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(rq)); + if (event == ADD_TASK) { + sync_window_start(rq, cpu_time); + migrate_type = RQ_TO_GROUP; + d.src_rq = rq; + d.src_cpu_time = NULL; + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } else { + migrate_type = GROUP_TO_RQ; + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + d.dst_rq = rq; + d.dst_cpu_time = NULL; + + /* + * In case of REM_TASK, cpu_time->window_start would be + * uptodate, because of the update_task_ravg() we called + * above on the moving task. 
Hence no need for + * sync_window_start() + */ + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + } + + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + + if (is_new_task(p)) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_prev_runnable_sum < 0); +} + +static inline struct group_cpu_time * +task_group_cpu_time(struct task_struct *p, int cpu) +{ + return _group_cpu_time(rcu_dereference(p->grp), cpu); +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu) +{ + return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; +} + +struct related_thread_group *alloc_related_thread_group(int group_id) +{ + struct related_thread_group *grp; + + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + return ERR_PTR(-ENOMEM); + + if (alloc_group_cputime(grp)) { + kfree(grp); + return ERR_PTR(-ENOMEM); + } + + grp->id = group_id; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + return grp; +} + +struct related_thread_group *lookup_related_thread_group(unsigned int group_id) +{ + struct related_thread_group *grp; + + list_for_each_entry(grp, &related_thread_groups, list) { + if (grp->id == group_id) + return grp; + } + + return NULL; +} + +/* See comments before preferred_cluster() */ +static void free_related_thread_group(struct rcu_head *rcu) +{ + struct related_thread_group *grp = container_of(rcu, struct + related_thread_group, rcu); + + free_group_cputime(grp); + kfree(grp); +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq; + int empty_group = 1; + + raw_spin_lock(&grp->lock); + + rq = __task_rq_lock(p); + transfer_busy_time(rq, p->grp, p, REM_TASK); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + __task_rq_unlock(rq); + + if (!list_empty(&grp->tasks)) { + empty_group = 0; + _set_preferred_cluster(grp); + } + + raw_spin_unlock(&grp->lock); + + if (empty_group) { + list_del(&grp->list); + call_rcu(&grp->rcu, free_related_thread_group); + } +} + +static int +add_task_to_group(struct task_struct *p, struct related_thread_group *grp) +{ + struct rq *rq; + + raw_spin_lock(&grp->lock); + + /* + * Change p->grp under rq->lock. 
Will prevent races with read-side + * reference of p->grp in various hot-paths + */ + rq = __task_rq_lock(p); + transfer_busy_time(rq, grp, p, ADD_TASK); + list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); + __task_rq_unlock(rq); + + _set_preferred_cluster(grp); + + raw_spin_unlock(&grp->lock); + + return 0; +} + +void add_new_task_to_grp(struct task_struct *new) +{ + unsigned long flags; + struct related_thread_group *grp; + struct task_struct *parent; + + if (!sysctl_sched_enable_thread_grouping) + return; + + if (thread_group_leader(new)) + return; + + parent = new->group_leader; + + /* + * The parent's pi_lock is required here to protect race + * against the parent task being removed from the + * group. + */ + raw_spin_lock_irqsave(&parent->pi_lock, flags); + + /* protected by pi_lock. */ + grp = task_related_thread_group(parent); + if (!grp) { + raw_spin_unlock_irqrestore(&parent->pi_lock, flags); + return; + } + raw_spin_lock(&grp->lock); + + rcu_assign_pointer(new->grp, grp); + list_add(&new->grp_list, &grp->tasks); + + raw_spin_unlock(&grp->lock); + raw_spin_unlock_irqrestore(&parent->pi_lock, flags); +} + +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0, destroy = 0; + unsigned long flags; + struct related_thread_group *grp = NULL, *new = NULL; + +redo: + raw_spin_lock_irqsave(&p->pi_lock, flags); + + if ((current != p && p->flags & PF_EXITING) || + (!p->grp && !group_id) || + (p->grp && p->grp->id == group_id)) + goto done; + + write_lock(&related_thread_group_lock); + + if (!group_id) { + remove_task_from_group(p); + write_unlock(&related_thread_group_lock); + goto done; + } + + if (p->grp && p->grp->id != group_id) + remove_task_from_group(p); + + grp = lookup_related_thread_group(group_id); + if (!grp && !new) { + /* New group */ + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + new = alloc_related_thread_group(group_id); + if (IS_ERR(new)) + return -ENOMEM; + destroy = 1; + /* Rerun checks (like task exiting), since we dropped pi_lock */ + goto redo; + } else if (!grp && new) { + /* New group - use object allocated before */ + destroy = 0; + list_add(&new->list, &related_thread_groups); + grp = new; + } + + BUG_ON(!grp); + rc = add_task_to_group(p, grp); + write_unlock(&related_thread_group_lock); +done: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + if (new && destroy) { + free_group_cputime(new); + kfree(new); + } + + return rc; +} + +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct related_thread_group *grp; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? 
grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + +static void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + + cpumask_copy(&cpumask, cpus); + pre_big_task_count_change(cpu_possible_mask); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + /* 'cpus' can contain cpumask more than one cluster */ + check_for_up_down_migrate_update(&cluster->cpus); + } + + __update_min_max_capacity(); + + post_big_task_count_change(cpu_possible_mask); +} + +static DEFINE_SPINLOCK(cpu_freq_min_max_lock); +void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) +{ + struct cpumask cpumask; + struct sched_cluster *cluster; + int i, update_capacity = 0; + unsigned long flags; + + spin_lock_irqsave(&cpu_freq_min_max_lock, flags); + cpumask_copy(&cpumask, cpus); + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + update_capacity += (cluster->max_mitigated_freq != fmax); + cluster->max_mitigated_freq = fmax; + } + spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); + + if (update_capacity) + update_cpu_cluster_capacity(cpus); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, &policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_cur_freq(cpu) == new_freq) + return 0; + + for_each_cpu(i, 
&policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return 0; +} + +static int pwr_stats_ready_notifier(struct notifier_block *nb, + unsigned long cpu, void *data) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + sched_update_freq_max_load(&mask); + + mutex_lock(&cluster_lock); + sort_clusters(); + mutex_unlock(&cluster_lock); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static struct notifier_block notifier_pwr_stats_ready = { + .notifier_call = pwr_stats_ready_notifier +}; + +int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) +{ + return -EINVAL; +} + +static int register_sched_callback(void) +{ + int ret; + + if (!sched_enable_hmp) + return 0; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + u32 new_load = task_load(p); + + if (!grp) + return 0; + + /* + * Update if task's load has changed significantly or a complete window + * has passed since we last updated preference + */ + if (abs(new_load - old_load) > sched_ravg_window / 4 || + sched_ktime_clock() - grp->last_update > sched_ravg_window) + return 1; + + return 0; +} + +bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + struct task_struct *p; + int loop_max = 10; + + if (!sched_boost() || !rq->cfs.h_nr_running) + return 0; + + rq->ed_task = NULL; + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { + if (!loop_max) + break; + + if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { + rq->ed_task = p; + return 1; + } + + loop_max--; + } + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED +u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->upmigrate_discouraged; +} + +int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage) +{ + struct task_group *tg = css_tg(css); + int discourage = upmigrate_discourage > 0; + + if (tg->upmigrate_discouraged == discourage) + return 0; + + /* + * Revisit big-task classification for tasks of this cgroup. It would + * have been efficient to walk tasks of just this cgroup in running + * state, but we don't have easy means to do that. Walk all tasks in + * running state on all cpus instead and re-visit their big task + * classification. 
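
Editor's note: update_preferred_cluster() above asks for a re-evaluation only when the task's demand has moved by more than a quarter of the averaging window, or when a full window has elapsed since the group's last update. A small userspace model of that predicate follows; should_reevaluate and WINDOW_NS are made-up names standing in for the kernel symbols, and the 20 ms window length is an assumption for illustration only.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for sched_ravg_window; assumed 20 ms (in ns) for this demo. */
	#define WINDOW_NS 20000000ULL

	/*
	 * Re-evaluate the preferred cluster only when the load changed by more
	 * than a quarter window, or a whole window passed since the last update.
	 */
	static bool should_reevaluate(uint32_t old_load, uint32_t new_load,
				      uint64_t now_ns, uint64_t last_update_ns)
	{
		uint32_t delta = new_load > old_load ? new_load - old_load
						     : old_load - new_load;

		return delta > WINDOW_NS / 4 ||
		       now_ns - last_update_ns > WINDOW_NS;
	}

	int main(void)
	{
		/* Small load change inside the same window: no re-evaluation. */
		printf("%d\n", should_reevaluate(1000000, 1200000, 5000000, 0));
		/* A full window has elapsed since the last update: re-evaluate. */
		printf("%d\n", should_reevaluate(1000000, 1200000, 25000000, 0));
		return 0;
	}
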
+ */ + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + + tg->upmigrate_discouraged = discourage; + + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + + return 0; +} +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b9566cf3ad37..e31334d5f581 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -254,7 +254,6 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; - bool notify_on_migrate; #ifdef CONFIG_SCHED_HMP bool upmigrate_discouraged; #endif @@ -356,6 +355,7 @@ extern void sched_move_task(struct task_struct *tsk); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); #endif +extern struct task_group *css_tg(struct cgroup_subsys_state *css); #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; @@ -367,9 +367,7 @@ struct cfs_bandwidth { }; struct hmp_sched_stats { int nr_big_tasks; u64 cumulative_runnable_avg; -#ifdef CONFIG_SCHED_FREQ_INPUT u64 pred_demands_sum; -#endif }; struct sched_cluster { @@ -393,6 +391,7 @@ struct sched_cluster { bool freq_init_done; int dstate, dstate_wakeup_latency, dstate_wakeup_energy; unsigned int static_cluster_pwr_cost; + int notifier_sent; }; extern unsigned long all_cluster_ids[]; @@ -410,23 +409,17 @@ struct related_thread_group { struct sched_cluster *preferred_cluster; struct rcu_head rcu; u64 last_update; -#ifdef CONFIG_SCHED_FREQ_INPUT struct group_cpu_time __percpu *cpu_time; /* one per cluster */ -#endif }; struct migration_sum_data { struct rq *src_rq, *dst_rq; -#ifdef CONFIG_SCHED_FREQ_INPUT struct group_cpu_time *src_cpu_time, *dst_cpu_time; -#endif }; extern struct list_head cluster_head; extern int num_clusters; extern struct sched_cluster *sched_cluster[NR_CPUS]; -extern int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand); struct cpu_cycle { u64 cycles; @@ -436,7 +429,7 @@ struct cpu_cycle { #define for_each_sched_cluster(cluster) \ list_for_each_entry_rcu(cluster, &cluster_head, list) -#endif +#endif /* CONFIG_SCHED_HMP */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -736,7 +729,6 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; - int cstate, wakeup_latency, wakeup_energy; /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; @@ -747,6 +739,7 @@ struct rq { struct cpumask freq_domain_cpumask; struct hmp_sched_stats hmp_stats; + int cstate, wakeup_latency, wakeup_energy; u64 window_start; unsigned long hmp_flags; @@ -756,15 +749,8 @@ struct rq { unsigned int static_cpu_pwr_cost; struct task_struct *ed_task; struct cpu_cycle cc; - -#ifdef CONFIG_SCHED_FREQ_INPUT u64 old_busy_time, old_busy_time_group; - int notifier_sent; u64 old_estimated_time; -#endif -#endif - -#ifdef CONFIG_SCHED_FREQ_INPUT u64 curr_runnable_sum; u64 prev_runnable_sum; u64 nt_curr_runnable_sum; @@ -1036,8 +1022,6 @@ static inline void sched_ttwu_pending(void) { } #include "stats.h" #include "auto_group.h" -extern void init_new_task_load(struct task_struct *p); - #ifdef CONFIG_SCHED_HMP #define WINDOW_STATS_RECENT 0 @@ -1046,9 +1030,16 @@ extern void init_new_task_load(struct task_struct *p); #define WINDOW_STATS_AVG 3 #define WINDOW_STATS_INVALID_POLICY 4 +#define MAJOR_TASK_PCT 85 +#define SCHED_UPMIGRATE_MIN_NICE 15 +#define EXITING_TASK_MARKER 0xdeaddead + +#define UP_MIGRATION 1 +#define DOWN_MIGRATION 2 +#define IRQLOAD_MIGRATION 3 + extern struct mutex policy_mutex; extern unsigned int sched_ravg_window; -extern unsigned int 
sched_use_pelt; extern unsigned int sched_disable_window_stats; extern unsigned int sched_enable_hmp; extern unsigned int max_possible_freq; @@ -1063,28 +1054,59 @@ extern unsigned int max_possible_capacity; extern unsigned int min_max_possible_capacity; extern unsigned int sched_upmigrate; extern unsigned int sched_downmigrate; -extern unsigned int sched_init_task_load_pelt; extern unsigned int sched_init_task_load_windows; extern unsigned int up_down_migrate_scale_factor; extern unsigned int sysctl_sched_restrict_cluster_spill; extern unsigned int sched_pred_alert_load; - -#ifdef CONFIG_SCHED_FREQ_INPUT -#define MAJOR_TASK_PCT 85 extern unsigned int sched_major_task_runtime; -#endif +extern struct sched_cluster init_cluster; +extern unsigned int __read_mostly sched_short_sleep_task_threshold; +extern unsigned int __read_mostly sched_long_cpu_selection_threshold; +extern unsigned int __read_mostly sched_big_waker_task_load; +extern unsigned int __read_mostly sched_small_wakee_task_load; +extern unsigned int __read_mostly sched_spill_load; +extern unsigned int __read_mostly sched_upmigrate; +extern unsigned int __read_mostly sched_downmigrate; +extern unsigned int __read_mostly sysctl_sched_spill_nr_run; +extern void init_new_task_load(struct task_struct *p); +extern u64 sched_ktime_clock(void); +extern int got_boost_kick(void); +extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +extern bool early_detection_notify(struct rq *rq, u64 wallclock); +extern void clear_ed_task(struct task_struct *p, struct rq *rq); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void clear_boost_kick(int cpu); +extern void clear_hmp_request(int cpu); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +extern void migrate_sync_cpu(int cpu); +extern void update_cluster_topology(void); +extern void set_task_last_wake(struct task_struct *p, u64 wallclock); +extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); +extern void init_clusters(void); +extern int __init set_sched_enable_hmp(char *str); extern void reset_cpu_hmp_stats(int cpu, int reset_cra); extern unsigned int max_task_load(void); extern void sched_account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock); extern void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock); - -unsigned int cpu_temp(int cpu); -int sched_set_group_id(struct task_struct *p, unsigned int group_id); +extern unsigned int cpu_temp(int cpu); extern unsigned int nr_eligible_big_tasks(int cpu); extern void update_up_down_migrate(void); +extern int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load); +extern void set_preferred_cluster(struct related_thread_group *grp); +extern void add_new_task_to_grp(struct task_struct *new); + +enum sched_boost_type { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; static inline struct sched_cluster *cpu_cluster(int cpu) { @@ -1180,20 +1202,9 @@ static inline u64 scale_load_to_cpu(u64 task_load, int cpu) static inline unsigned int task_load(struct task_struct *p) { - if (sched_use_pelt) - return p->se.avg.runnable_avg_sum_scaled; - return p->ravg.demand; } -#ifdef CONFIG_SCHED_FREQ_INPUT -#define set_pred_demands_sum(stats, x) ((stats)->pred_demands_sum = (x)) -#define verify_pred_demands_sum(stat) 
BUG_ON((s64)(stat)->pred_demands_sum < 0) -#else -#define set_pred_demands_sum(stats, x) -#define verify_pred_demands_sum(stat) -#endif - static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, struct task_struct *p) @@ -1203,33 +1214,29 @@ inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, if (!sched_enable_hmp || sched_disable_window_stats) return; - task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled : - (sched_disable_window_stats ? 0 : p->ravg.demand); + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; stats->cumulative_runnable_avg += task_load; - set_pred_demands_sum(stats, stats->pred_demands_sum + - p->ravg.pred_demand); + stats->pred_demands_sum += p->ravg.pred_demand; } static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, - struct task_struct *p) + struct task_struct *p) { u32 task_load; if (!sched_enable_hmp || sched_disable_window_stats) return; - task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled : - (sched_disable_window_stats ? 0 : p->ravg.demand); + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; stats->cumulative_runnable_avg -= task_load; BUG_ON((s64)stats->cumulative_runnable_avg < 0); - set_pred_demands_sum(stats, stats->pred_demands_sum - - p->ravg.pred_demand); - verify_pred_demands_sum(stats); + stats->pred_demands_sum -= p->ravg.pred_demand; + BUG_ON((s64)stats->pred_demands_sum < 0); } static inline void @@ -1243,12 +1250,10 @@ fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, stats->cumulative_runnable_avg += task_load_delta; BUG_ON((s64)stats->cumulative_runnable_avg < 0); - set_pred_demands_sum(stats, stats->pred_demands_sum + - pred_demand_delta); - verify_pred_demands_sum(stats); + stats->pred_demands_sum += pred_demand_delta; + BUG_ON((s64)stats->pred_demands_sum < 0); } - #define pct_to_real(tunable) \ (div64_u64((u64)tunable * (u64)max_task_load(), 100)) @@ -1280,90 +1285,25 @@ static inline int sched_cpu_high_irqload(int cpu) return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; } -static inline -struct related_thread_group *task_related_thread_group(struct task_struct *p) +static inline bool task_in_related_thread_group(struct task_struct *p) { - return rcu_dereference(p->grp); + return !!(rcu_access_pointer(p->grp) != NULL); } -#else /* CONFIG_SCHED_HMP */ - -#define sched_use_pelt 0 - -struct hmp_sched_stats; -struct related_thread_group; - -static inline u64 scale_load_to_cpu(u64 load, int cpu) -{ - return load; -} - -static inline unsigned int nr_eligible_big_tasks(int cpu) -{ - return 0; -} - -static inline int pct_task_load(struct task_struct *p) { return 0; } - -static inline int cpu_capacity(int cpu) -{ - return SCHED_LOAD_SCALE; -} - -static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } - -static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, - struct task_struct *p) -{ -} - -static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, - struct task_struct *p) -{ -} - -static inline void sched_account_irqtime(int cpu, struct task_struct *curr, - u64 delta, u64 wallclock) -{ -} - -static inline void sched_account_irqstart(int cpu, struct task_struct *curr, - u64 wallclock) -{ -} - -static inline int sched_cpu_high_irqload(int cpu) { return 0; } - -static inline void set_preferred_cluster(struct related_thread_group *grp) { } - static inline struct related_thread_group *task_related_thread_group(struct task_struct *p) { - return NULL; -} - -static inline u32 
task_load(struct task_struct *p) { return 0; } - -static inline int update_preferred_cluster(struct related_thread_group *grp, - struct task_struct *p, u32 old_load) -{ - return 0; + return rcu_dereference(p->grp); } -#endif /* CONFIG_SCHED_HMP */ - -/* - * Returns the rq capacity of any rq in a group. This does not play - * well with groups where rq capacity can change independently. - */ -#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) - -#ifdef CONFIG_SCHED_FREQ_INPUT #define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand) extern void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); +extern void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p); + struct group_cpu_time { u64 curr_runnable_sum; u64 prev_runnable_sum; @@ -1383,23 +1323,6 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu) return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); } -#else /* CONFIG_SCHED_FREQ_INPUT */ - -#define sched_migration_fixup 0 -#define PRED_DEMAND_DELTA (0) - -static inline void -check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } - -static inline int same_freq_domain(int src_cpu, int dst_cpu) -{ - return 1; -} - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -#ifdef CONFIG_SCHED_HMP - #define BOOST_KICK 0 #define CPU_RESERVED 1 @@ -1456,11 +1379,220 @@ extern unsigned int power_cost(int cpu, u64 demand); extern void reset_all_window_stats(u64 window_start, unsigned int window_size); extern void boost_kick(int cpu); extern int sched_boost(void); +extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_type boost_type); +extern enum sched_boost_type sched_boost_type(void); +extern int task_will_fit(struct task_struct *p, int cpu); +extern int group_will_fit(struct sched_cluster *cluster, + struct related_thread_group *grp, u64 demand); +extern u64 cpu_load(int cpu); +extern u64 cpu_load_sync(int cpu, int sync); +extern int preferred_cluster(struct sched_cluster *cluster, + struct task_struct *p); +extern void inc_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void dec_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void inc_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern void dec_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern int is_big_task(struct task_struct *p); +extern int upmigrate_discouraged(struct task_struct *p); +extern struct sched_cluster *rq_cluster(struct rq *rq); +extern int nr_big_tasks(struct rq *rq); +extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta); +extern void reset_task_stats(struct task_struct *p); +extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra); +extern void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra); +extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage); + +#else /* CONFIG_SCHED_HMP */ + +struct hmp_sched_stats; +struct related_thread_group; +struct sched_cluster; + +static inline int got_boost_kick(void) +{ + return 0; +} + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline bool early_detection_notify(struct rq *rq, u64 wallclock) 
+{ + return 0; +} + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { } +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void clear_boost_kick(int cpu) { } +static inline void clear_hmp_request(int cpu) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void migrate_sync_cpu(int cpu) { } +static inline void update_cluster_topology(void) { } +static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } +static inline void set_task_last_switch_out(struct task_struct *p, + u64 wallclock) { } + +static inline int task_will_fit(struct task_struct *p, int cpu) +{ + return 1; +} + +static inline int select_best_cpu(struct task_struct *p, int target, + int reason, int sync) +{ + return 0; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline int sched_boost(void) +{ + return 0; +} + +static inline int is_big_task(struct task_struct *p) +{ + return 0; +} -#else /* CONFIG_SCHED_HMP */ +static inline int nr_big_tasks(struct rq *rq) +{ + return 0; +} + +static inline int is_cpu_throttling_imminent(int cpu) +{ + return 0; +} + +static inline int is_task_migration_throttled(struct task_struct *p) +{ + return 0; +} + +static inline unsigned int cpu_temp(int cpu) +{ + return 0; +} + +static inline void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + return 1; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return NULL; +} + +static inline void init_new_task_load(struct task_struct *p) { } + +static inline u64 scale_load_to_cpu(u64 load, int cpu) +{ + return load; +} + +static inline unsigned int nr_eligible_big_tasks(int cpu) +{ + return 0; +} + +static inline int pct_task_load(struct task_struct *p) { return 0; } + +static inline int cpu_capacity(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } + +static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ +} + +static inline void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock) +{ +} + +static inline int sched_cpu_high_irqload(int cpu) { return 0; } + +static inline void set_preferred_cluster(struct related_thread_group *grp) { } + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return false; +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return NULL; +} + +static inline u32 task_load(struct task_struct *p) { return 0; } + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + return 0; +} + +static inline void add_new_task_to_grp(struct task_struct *new) {} #define sched_enable_hmp 0 
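
Editor's note: the stub block above and below follows the usual kernel pattern for the !CONFIG_SCHED_HMP build: every HMP entry point becomes an empty static inline and each tunable collapses to a constant macro, so common scheduler code can call them unconditionally and the compiler folds the dead paths away. A self-contained illustration of the pattern is shown below; FEATURE_X, feature_enabled and do_feature_work() are invented names, not kernel symbols.

	#include <stdio.h>

	/* Build with -DFEATURE_X for the real implementation, without it for stubs. */
	#ifdef FEATURE_X

	static int feature_enabled = 1;

	static void do_feature_work(int cpu)
	{
		printf("doing feature work on cpu %d\n", cpu);
	}

	#else  /* !FEATURE_X */

	/* The tunable collapses to a constant, so "if (feature_enabled)" folds away... */
	#define feature_enabled 0

	/*
	 * ...and the entry point becomes an empty static inline, so callers
	 * compile unchanged without any #ifdefs of their own.
	 */
	static inline void do_feature_work(int cpu) { }

	#endif /* FEATURE_X */

	int main(void)
	{
		/* Common code path: identical whether or not FEATURE_X is built in. */
		if (feature_enabled)
			do_feature_work(0);
		return 0;
	}
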
#define sched_freq_legacy_mode 1 +#define sched_migration_fixup 0 +#define PRED_DEMAND_DELTA (0) + +static inline void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } + +static inline void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } static inline void pre_big_task_count_change(void) { } @@ -1474,7 +1606,13 @@ static inline void clear_reserved(int cpu) { } #define trace_sched_cpu_load_cgroup(...) #define trace_sched_cpu_load_wakeup(...) -#endif /* CONFIG_SCHED_HMP */ +#endif /* CONFIG_SCHED_HMP */ + +/* + * Returns the rq capacity of any rq in a group. This does not play + * well with groups where rq capacity can change independently. + */ +#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) #ifdef CONFIG_CGROUP_SCHED @@ -1496,11 +1634,6 @@ static inline struct task_group *task_group(struct task_struct *p) return p->sched_task_group; } -static inline bool task_notify_on_migrate(struct task_struct *p) -{ - return task_group(p)->notify_on_migrate; -} - /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { @@ -1526,10 +1659,6 @@ static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -static inline bool task_notify_on_migrate(struct task_struct *p) -{ - return false; -} #endif /* CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81fbed978da3..ac34212f6881 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -292,14 +292,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_wakeup_load_threshold", - .data = &sysctl_sched_wakeup_load_threshold, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHED_FREQ_INPUT +#ifdef CONFIG_SCHED_HMP { .procname = "sched_freq_inc_notify", .data = &sysctl_sched_freq_inc_notify, @@ -316,8 +309,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, -#endif -#ifdef CONFIG_SCHED_HMP { .procname = "sched_cpu_high_irqload", .data = &sysctl_sched_cpu_high_irqload, @@ -414,7 +405,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, }, -#ifdef CONFIG_SCHED_FREQ_INPUT + { + .procname = "sched_enable_thread_grouping", + .data = &sysctl_sched_enable_thread_grouping, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_new_task_windows", .data = &sysctl_sched_new_task_windows, @@ -437,7 +434,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_window_update_handler, }, -#endif { .procname = "sched_boost", .data = &sysctl_sched_boost, |
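
Editor's note: the sysctl.c hunk above registers a new "sched_enable_thread_grouping" entry in kern_table, which gates the automatic parent/child thread grouping added in add_new_task_to_grp(). Entries in kern_table normally surface under /proc/sys/kernel/, so the knob should be reachable at /proc/sys/kernel/sched_enable_thread_grouping on HMP-enabled kernels; the path below is inferred from that convention rather than taken from this diff, and writing it requires root.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Toggle automatic thread grouping via the sysctl added by this patch. */
	int main(void)
	{
		const char *path = "/proc/sys/kernel/sched_enable_thread_grouping";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror(path);
			return 1;
		}
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}
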