Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c | 5
-rw-r--r--  kernel/cgroup.c | 77
-rw-r--r--  kernel/cpu.c | 69
-rw-r--r--  kernel/cpu_pm.c | 31
-rw-r--r--  kernel/cpuset.c | 54
-rw-r--r--  kernel/events/core.c | 335
-rw-r--r--  kernel/events/hw_breakpoint.c | 2
-rw-r--r--  kernel/events/internal.h | 1
-rw-r--r--  kernel/events/ring_buffer.c | 94
-rw-r--r--  kernel/events/uprobes.c | 3
-rw-r--r--  kernel/exit.c | 17
-rw-r--r--  kernel/fork.c | 13
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/cpuhotplug.c | 38
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/manage.c | 24
-rw-r--r--  kernel/irq/msi.c | 2
-rw-r--r--  kernel/irq/proc.c | 7
-rw-r--r--  kernel/irq/settings.h | 12
-rw-r--r--  kernel/locking/mutex.c | 12
-rw-r--r--  kernel/locking/osq_lock.c | 9
-rw-r--r--  kernel/locking/spinlock_debug.c | 14
-rw-r--r--  kernel/module.c | 10
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/power/hibernate.c | 20
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/qos.c | 222
-rw-r--r--  kernel/power/swap.c | 34
-rw-r--r--  kernel/printk/printk.c | 20
-rw-r--r--  kernel/rcu/tree.c | 56
-rw-r--r--  kernel/rcu/tree_plugin.h | 14
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/sched/Makefile | 5
-rw-r--r--  kernel/sched/boost.c | 217
-rw-r--r--  kernel/sched/clock.c | 2
-rw-r--r--  kernel/sched/core.c | 864
-rw-r--r--  kernel/sched/core_ctl.c | 1169
-rw-r--r--  kernel/sched/cpupri.c | 48
-rw-r--r--  kernel/sched/cputime.c | 17
-rw-r--r--  kernel/sched/deadline.c | 50
-rw-r--r--  kernel/sched/debug.c | 46
-rw-r--r--  kernel/sched/energy.c | 10
-rw-r--r--  kernel/sched/fair.c | 2013
-rw-r--r--  kernel/sched/features.h | 2
-rw-r--r--  kernel/sched/hmp.c | 4407
-rw-r--r--  kernel/sched/idle_task.c | 25
-rw-r--r--  kernel/sched/rt.c | 322
-rw-r--r--  kernel/sched/sched.h | 851
-rw-r--r--  kernel/sched/sched_avg.c | 199
-rw-r--r--  kernel/sched/stop_task.c | 45
-rw-r--r--  kernel/sched/tune.c | 189
-rw-r--r--  kernel/smp.c | 39
-rw-r--r--  kernel/smpboot.c | 43
-rw-r--r--  kernel/softirq.c | 30
-rw-r--r--  kernel/sysctl.c | 301
-rw-r--r--  kernel/sysctl_binary.c | 4
-rw-r--r--  kernel/taskstats.c | 6
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/alarmtimer.c | 268
-rw-r--r--  kernel/time/clocksource.c | 48
-rw-r--r--  kernel/time/hrtimer.c | 134
-rw-r--r--  kernel/time/posix-cpu-timers.c | 2
-rw-r--r--  kernel/time/sched_clock.c | 19
-rw-r--r--  kernel/time/tick-sched.c | 98
-rw-r--r--  kernel/time/timekeeping.c | 31
-rw-r--r--  kernel/time/timer.c | 237
-rw-r--r--  kernel/time/timer_list.c | 10
-rw-r--r--  kernel/time/timer_stats.c | 425
-rw-r--r--  kernel/trace/Kconfig | 38
-rw-r--r--  kernel/trace/Makefile | 6
-rw-r--r--  kernel/trace/blktrace.c | 80
-rw-r--r--  kernel/trace/ipc_logging.c | 876
-rw-r--r--  kernel/trace/ipc_logging_debug.c | 184
-rw-r--r--  kernel/trace/ipc_logging_private.h | 165
-rw-r--r--  kernel/trace/msm_rtb.c | 346
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/trace.c | 15
-rw-r--r--  kernel/trace/trace_cpu_freq_switch.c | 312
-rw-r--r--  kernel/trace/trace_event_perf.c | 3
-rw-r--r--  kernel/trace/trace_events.c | 5
-rw-r--r--  kernel/trace/trace_irqsoff.c | 42
-rw-r--r--  kernel/trace/trace_printk.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 3
-rw-r--r--  kernel/trace/trace_syscalls.c | 4
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/watchdog.c | 66
-rw-r--r--  kernel/workqueue.c | 185
89 files changed, 14443 insertions, 1313 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2b9a4739381d..b7aac4c5c9a1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2251,10 +2251,11 @@ int __audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = uid;
security_task_getsecid(tsk, &audit_sig_sid);
}
- if (!audit_signals || audit_dummy_context())
- return 0;
}
+ if (!audit_signals || audit_dummy_context())
+ return 0;
+
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 96f0fe9441af..cd3d81961cc2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
+static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -716,10 +717,10 @@ static void css_set_move_task(struct task_struct *task,
if (to_cset) {
/*
- * We are synchronized through cgroup_threadgroup_rwsem
- * against PF_EXITING setting such that we can't race
- * against cgroup_exit() changing the css_set to
- * init_css_set and dropping the old one.
+ * We are synchronized through css_set_lock against
+ * PF_EXITING setting such that we can't race against
+ * cgroup_exit() disassociating the task from the
+ * css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -1652,10 +1653,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
@@ -1722,15 +1719,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
@@ -2013,9 +2001,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
- struct cgroup_root *root;
+ struct cgroup_root *root = NULL;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
@@ -2029,6 +2018,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
@@ -2036,15 +2036,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
@@ -2155,9 +2146,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
-
+out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2200,6 +2192,12 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -2801,6 +2799,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -2815,6 +2814,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -4074,6 +4074,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
@@ -4106,6 +4108,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -5401,6 +5404,7 @@ int __init cgroup_init(void)
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
@@ -5706,19 +5710,22 @@ void cgroup_exit(struct task_struct *tsk)
int i;
/*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check css_set and cg_list without synchronization.
+ * Avoid potential race with the migrate path.
+ */
+ spin_lock_irq(&css_set_lock);
+ /*
+ * Unlink from @tsk from its css_set.
*/
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
- spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
- spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
+ spin_unlock_irq(&css_set_lock);
+
/* see cgroup_post_fork() for details */
for_each_subsys_which(ss, i, &have_exit_callback)
ss->exit(tsk);
@@ -5780,7 +5787,9 @@ static void cgroup_release_agent(struct work_struct *work)
if (!pathbuf || !agentbuf)
goto out;
+ spin_lock_irq(&css_set_lock);
path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ spin_unlock_irq(&css_set_lock);
if (!path)
goto out;
@@ -5955,7 +5964,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cset);
+ seq_printf(seq, "css_set %pK\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
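The cgroup.c hunks above register a second filesystem type, "cgroup2", which always mounts the default hierarchy and rejects any mount option string. A minimal userspace sketch of exercising that mount path follows; the mount point path is an assumption for illustration and must already exist. Passing any option string makes the new code fail with -EINVAL, which is why data stays NULL.

```c
/*
 * Minimal sketch: mount the v2 (default) cgroup hierarchy registered
 * above as "cgroup2". The mount point is an assumed, pre-existing
 * directory; the kernel rejects any option string, so data stays NULL.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("none", "/sys/fs/cgroup/unified", "cgroup2", 0, NULL) < 0) {
		fprintf(stderr, "mount cgroup2: %s\n", strerror(errno));
		return 1;
	}
	puts("cgroup v2 hierarchy mounted");
	return 0;
}
```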
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 24ca2963754b..5b4440d57f89 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -91,6 +91,11 @@ static struct {
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+void cpu_hotplug_mutex_held(void)
+{
+ lockdep_assert_held(&cpu_hotplug.lock);
+}
+EXPORT_SYMBOL(cpu_hotplug_mutex_held);
void get_online_cpus(void)
{
@@ -361,6 +366,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
+ if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1)
+ return -EBUSY;
+
cpu_hotplug_begin();
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
@@ -520,8 +528,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- pr_warn("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
@@ -545,9 +553,41 @@ out:
return ret;
}
+static int switch_to_rt_policy(void)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ unsigned int policy = current->policy;
+ int err;
+
+ /* Nobody should be attempting hotplug from these policy contexts. */
+ if (policy == SCHED_BATCH || policy == SCHED_IDLE ||
+ policy == SCHED_DEADLINE)
+ return -EPERM;
+
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
+ return 1;
+
+ /* Only SCHED_NORMAL left. */
+ err = sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+ return err;
+
+}
+
+static int switch_to_fair_policy(void)
+{
+ struct sched_param param = { .sched_priority = 0 };
+
+ return sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+}
+
int cpu_up(unsigned int cpu)
{
int err = 0;
+ int switch_err = 0;
+
+ switch_err = switch_to_rt_policy();
+ if (switch_err < 0)
+ return switch_err;
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
@@ -573,6 +613,14 @@ int cpu_up(unsigned int cpu)
out:
cpu_maps_update_done();
+
+ if (!switch_err) {
+ switch_err = switch_to_fair_policy();
+ if (switch_err)
+ pr_err("Hotplug policy switch err=%d Task %s pid=%d\n",
+ switch_err, current->comm, current->pid);
+ }
+
return err;
}
EXPORT_SYMBOL_GPL(cpu_up);
@@ -797,6 +845,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);
+static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits);
+EXPORT_SYMBOL(cpu_isolated_mask);
+
void set_cpu_possible(unsigned int cpu, bool possible)
{
if (possible)
@@ -831,6 +883,14 @@ void set_cpu_active(unsigned int cpu, bool active)
cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
}
+void set_cpu_isolated(unsigned int cpu, bool isolated)
+{
+ if (isolated)
+ cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits));
+ else
+ cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits));
+}
+
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_present_bits), src);
@@ -846,6 +906,11 @@ void init_cpu_online(const struct cpumask *src)
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+void init_cpu_isolated(const struct cpumask *src)
+{
+ cpumask_copy(to_cpumask(cpu_isolated_bits), src);
+}
+
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
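cpu_up() now promotes the calling task to SCHED_FIFO via switch_to_rt_policy() so that a busy SCHED_NORMAL caller cannot be starved during the hotplug sequence, then drops back via switch_to_fair_policy(). Below is a hedged userspace sketch of the same boost-and-restore pattern using the POSIX scheduler API; do_hotplug_work() is a placeholder, and the in-kernel path uses sched_setscheduler_nocheck() rather than the syscall.

```c
/*
 * Sketch of the "boost to RT for a critical operation, then restore"
 * pattern that cpu_up() now uses. do_hotplug_work() is a placeholder.
 */
#include <sched.h>
#include <stdio.h>

static void do_hotplug_work(void)
{
	/* the work that must not be starved goes here */
}

int main(void)
{
	struct sched_param rt = {
		.sched_priority = sched_get_priority_max(SCHED_FIFO),
	};
	struct sched_param fair = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_FIFO, &rt) == 0) {
		do_hotplug_work();
		/* always drop back, mirroring switch_to_fair_policy() */
		if (sched_setscheduler(0, SCHED_OTHER, &fair) != 0)
			perror("restore SCHED_OTHER");
	} else {
		perror("SCHED_FIFO (needs CAP_SYS_NICE)");
		do_hotplug_work();	/* proceed without the boost */
	}
	return 0;
}
```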
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..774bfe7a2893 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,14 +22,17 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
+bool from_suspend = false;
+
static DEFINE_RWLOCK(cpu_pm_notifier_lock);
static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
-static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls,
+ void *data)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, data,
nr_to_call, nr_calls);
return notifier_to_errno(ret);
@@ -101,13 +104,13 @@ int cpu_pm_enter(void)
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
+ ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls, NULL);
if (ret)
/*
* Inform listeners (nr_calls - 1) about failure of CPU PM
* PM entry who are notified earlier to prepare for it.
*/
- cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -131,7 +134,7 @@ int cpu_pm_exit(void)
int ret;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -154,19 +157,21 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*
* Return conditions are same as __raw_notifier_call_chain.
*/
-int cpu_cluster_pm_enter(void)
+int cpu_cluster_pm_enter(unsigned long aff_level)
{
int nr_calls;
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls,
+ (void *) aff_level);
if (ret)
/*
* Inform listeners (nr_calls - 1) about failure of CPU cluster
* PM entry who are notified earlier to prepare for it.
*/
- cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL,
+ (void *) aff_level);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -188,12 +193,12 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*
* Return conditions are same as __raw_notifier_call_chain.
*/
-int cpu_cluster_pm_exit(void)
+int cpu_cluster_pm_exit(unsigned long aff_level)
{
int ret;
read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL, (void *) aff_level);
read_unlock(&cpu_pm_notifier_lock);
return ret;
@@ -205,17 +210,19 @@ static int cpu_pm_suspend(void)
{
int ret;
+ from_suspend = true;
ret = cpu_pm_enter();
if (ret)
return ret;
- ret = cpu_cluster_pm_enter();
+ ret = cpu_cluster_pm_enter(0);
return ret;
}
static void cpu_pm_resume(void)
{
- cpu_cluster_pm_exit();
+ from_suspend = false;
+ cpu_cluster_pm_exit(0);
cpu_pm_exit();
}
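cpu_cluster_pm_enter()/cpu_cluster_pm_exit() now take an affinity level and hand it to listeners through the notifier chain's void *data argument. The sketch below is a self-contained mock (not the kernel notifier API) showing the cast on the receiving side; the callback name and the CLUSTER_PM_ENTER constant are stand-ins, not the kernel enum values.

```c
/*
 * Standalone illustration of passing an affinity level through the
 * void *data argument of a notifier callback, as cpu_pm_notify() now
 * does with aff_level. Everything here is mocked so it compiles alone.
 */
#include <stdio.h>

enum { CLUSTER_PM_ENTER = 1 };	/* stand-in, not the kernel value */

static int cluster_pm_cb(unsigned long event, void *data)
{
	/* same cast direction as cpu_pm_notify(..., (void *)aff_level) */
	unsigned long aff_level = (unsigned long)data;

	if (event == CLUSTER_PM_ENTER)
		printf("cluster enter, affinity level %lu\n", aff_level);
	return 0;
}

int main(void)
{
	unsigned long aff_level = 1;	/* e.g. the cluster level */

	return cluster_pm_cb(CLUSTER_PM_ENTER, (void *)aff_level);
}
```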
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index db86d6b609b6..a599351997ad 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,7 +99,7 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
- cpumask_var_t cpus_requested;
+ cpumask_var_t cpus_requested; /* CPUS requested, but not used because of hotplug */
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
@@ -807,16 +807,15 @@ done:
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_unlocked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ cpu_hotplug_mutex_held();
lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -824,27 +823,27 @@ static void rebuild_sched_domains_locked(void)
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_unlocked(void)
{
}
#endif /* CONFIG_SMP */
void rebuild_sched_domains(void)
{
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
/**
@@ -876,7 +875,6 @@ static void update_tasks_cpumask(struct cpuset *cs)
*
* On legacy hierachy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
@@ -931,7 +929,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
rcu_read_unlock();
if (need_rebuild_sched_domains)
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
}
/**
@@ -1290,7 +1288,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
}
return 0;
@@ -1321,7 +1319,6 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1356,7 +1353,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1621,6 +1618,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1658,6 +1656,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1668,6 +1667,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1682,6 +1682,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1720,6 +1721,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1745,6 +1747,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -1958,11 +1961,11 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
- goto free_allowed;
+ goto error_allowed;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_requested;
+ goto error_effective;
+ if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+ goto error_requested;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
@@ -1975,11 +1978,11 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return &cs->css;
-free_requested:
- free_cpumask_var(cs->cpus_requested);
-free_allowed:
+error_requested:
+ free_cpumask_var(cs->effective_cpus);
+error_effective:
free_cpumask_var(cs->cpus_allowed);
-free_cs:
+error_allowed:
kfree(cs);
return ERR_PTR(-ENOMEM);
}
@@ -2051,13 +2054,14 @@ out_unlock:
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_unlocked().
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
@@ -2067,6 +2071,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2278,7 +2283,8 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
+ cpumask_and(&new_cpus, cs->cpus_requested,
+ parent_cs(cs)->effective_cpus);
nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
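The cpuset changes move the CPU-hotplug lock out to the callers: every write path now calls get_online_cpus() before taking cpuset_mutex, and the renamed rebuild_sched_domains_unlocked() only asserts that both are already held. A small sketch of that ordering discipline, with pthread mutexes standing in for the two kernel locks:

```c
/*
 * Ordering sketch: outer "hotplug" lock first, inner "cpuset" lock
 * second, and the worker assumes both are held -- mirroring
 * rebuild_sched_domains() vs rebuild_sched_domains_unlocked() above.
 * The pthread mutexes are illustrative stand-ins, not kernel primitives.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* ~get_online_cpus() */
static pthread_mutex_t cpuset_lock = PTHREAD_MUTEX_INITIALIZER;  /* ~cpuset_mutex */

static void rebuild_domains_unlocked(void)
{
	/* caller guarantees both locks are held, like the lockdep asserts */
	puts("rebuilding sched domains");
}

static void rebuild_domains(void)
{
	pthread_mutex_lock(&hotplug_lock);	/* outer lock first */
	pthread_mutex_lock(&cpuset_lock);	/* then the inner one */
	rebuild_domains_unlocked();
	pthread_mutex_unlock(&cpuset_lock);
	pthread_mutex_unlock(&hotplug_lock);
}

int main(void)
{
	rebuild_domains();
	return 0;
}
```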
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0f9ada220789..66c1084752f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -158,6 +158,7 @@ enum event_type_t {
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(bool, is_idle);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -177,7 +178,9 @@ static struct srcu_struct pmus_srcu;
* 2 - disallow kernel profiling for unpriv
* 3 - disallow all unpriv perf event use
*/
-#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+#ifdef CONFIG_PERF_EVENTS_USERMODE
+int sysctl_perf_event_paranoid __read_mostly = -1;
+#elif defined CONFIG_SECURITY_PERF_EVENTS_RESTRICT
int sysctl_perf_event_paranoid __read_mostly = 3;
#else
int sysctl_perf_event_paranoid __read_mostly = 1;
@@ -1497,10 +1500,17 @@ static void perf_group_detach(struct perf_event *event)
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
+ * If this isn't on a list, make sure we still remove the sibling's
+ * group_entry from this sibling_list; otherwise, when that sibling
+ * is later deallocated, it will try to remove itself from this
+ * sibling_list, which may well have been deallocated already,
+ * resulting in a use-after-free.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
if (list)
list_move_tail(&sibling->group_entry, list);
+ else
+ list_del_init(&sibling->group_entry);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
@@ -1695,7 +1705,32 @@ static int __perf_remove_from_context(void *info)
}
-/*
+#ifdef CONFIG_SMP
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+ int up_ret;
+ /*
+ * CPU was offline. Bring it online so we can
+ * gracefully exit a perf context.
+ */
+ up_ret = cpu_up(event->cpu);
+ if (!up_ret)
+ /* Try the remove call once again. */
+ cpu_function_call(event->cpu, __perf_remove_from_context,
+ rep);
+ else
+ pr_err("Failed to bring up CPU: %d, ret: %d\n",
+ event->cpu, up_ret);
+}
+#else
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+}
+#endif
+
+ /*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
@@ -1708,7 +1743,8 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void __ref perf_remove_from_context(struct perf_event *event,
+ bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -1716,6 +1752,7 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
.event = event,
.detach_group = detach_group,
};
+ int ret;
lockdep_assert_held(&ctx->mutex);
@@ -1726,7 +1763,11 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
* already called __perf_remove_from_context from
* perf_event_exit_cpu.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, &re);
+ ret = cpu_function_call(event->cpu, __perf_remove_from_context,
+ &re);
+ if (ret == -ENXIO)
+ perf_retry_remove(event, &re);
+
return;
}
@@ -1922,8 +1963,13 @@ event_sched_in(struct perf_event *event,
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
+ WRITE_ONCE(event->oncpu, smp_processor_id());
+ /*
+ * Order event::oncpu write to happen before the ACTIVE state
+ * is visible.
+ */
+ smp_wmb();
+ WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2404,6 +2450,29 @@ void perf_event_enable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_enable);
+static int __perf_event_stop(void *info)
+{
+ struct perf_event *event = info;
+
+ /* for AUX events, our job is done if the event is already inactive */
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another CPU's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ return 0;
+}
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
@@ -3379,21 +3448,30 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ event_cpu = READ_ONCE(event->oncpu);
+
+ if (event->state == PERF_EVENT_STATE_ACTIVE &&
+ !cpu_isolated(event_cpu)) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
- ret = data.ret;
+
+ if ((unsigned int)event_cpu >= nr_cpu_ids)
+ return 0;
+ if (!event->attr.exclude_idle ||
+ !per_cpu(is_idle, event_cpu)) {
+ smp_call_function_single(event_cpu,
+ __perf_event_read, &data, 1);
+ ret = data.ret;
+ }
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
@@ -3488,7 +3566,8 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (event->owner != EVENT_OWNER_KERNEL && perf_paranoid_cpu() &&
+ !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
/*
@@ -3729,6 +3808,9 @@ static void __free_event(struct perf_event *event)
if (event->destroy)
event->destroy(event);
+ if (event->pmu->free_drv_configs)
+ event->pmu->free_drv_configs(event);
+
if (event->ctx)
put_ctx(event->ctx);
@@ -3872,6 +3954,15 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
+ struct perf_event *event = file->private_data;
+
+ /*
+ * Event can be in state OFF because of a constraint check.
+ * Change to ACTIVE so that it gets cleaned up correctly.
+ */
+ if ((event->state == PERF_EVENT_STATE_OFF) &&
+ event->attr.constraint_duplicate)
+ event->state = PERF_EVENT_STATE_ACTIVE;
put_event(file->private_data);
return 0;
}
@@ -4281,6 +4372,8 @@ static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_event_drv_configs(struct perf_event *event,
+ void __user *arg);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -4337,6 +4430,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_SET_DRV_CONFIGS:
+ return perf_event_drv_configs(event, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -4369,6 +4465,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+ case _IOC_NR(PERF_EVENT_IOC_SET_DRV_CONFIGS):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
@@ -4653,6 +4750,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
event->pmu->event_mapped(event);
}
+static void perf_pmu_output_stop(struct perf_event *event);
+
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
@@ -4680,10 +4779,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ /*
+ * Stop all AUX events that are writing to this buffer,
+ * so that we can free its AUX pages and corresponding PMU
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ */
+ perf_pmu_output_stop(event);
+
+ /* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ /* this has to be the last one */
rb_free_aux(rb);
+ WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
mutex_unlock(&event->mmap_mutex);
}
@@ -5759,6 +5870,80 @@ next:
rcu_read_unlock();
}
+struct remote_output {
+ struct ring_buffer *rb;
+ int err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+ struct perf_event *parent = event->parent;
+ struct remote_output *ro = data;
+ struct ring_buffer *rb = ro->rb;
+
+ if (!has_aux(event))
+ return;
+
+ if (!parent)
+ parent = event;
+
+ /*
+ * In case of inheritance, it will be the parent that links to the
+ * ring-buffer, but it will be the child that's actually using it:
+ */
+ if (rcu_dereference(parent->rb) == rb)
+ ro->err = __perf_event_stop(event);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+ struct perf_event *event = info;
+ struct pmu *pmu = event->pmu;
+ struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ struct remote_output ro = {
+ .rb = event->rb,
+ };
+
+ rcu_read_lock();
+ perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro);
+ if (cpuctx->task_ctx)
+ perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ &ro);
+ rcu_read_unlock();
+
+ return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int err, cpu;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+ /*
+ * For per-CPU events, we need to make sure that neither they
+ * nor their children are running; for cpu==-1 events it's
+ * sufficient to stop the event itself if it's active, since
+ * it can't have children.
+ */
+ cpu = iter->cpu;
+ if (cpu == -1)
+ cpu = READ_ONCE(iter->oncpu);
+
+ if (cpu == -1)
+ continue;
+
+ err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+ if (err == -EAGAIN) {
+ rcu_read_unlock();
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
/*
* task tracking -- fork/exit
*
@@ -6953,6 +7138,8 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
#ifdef CONFIG_EVENT_TRACING
@@ -7076,6 +7263,8 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
static inline void perf_tp_register(void)
@@ -7188,6 +7377,15 @@ void perf_bp_event(struct perf_event *bp, void *data)
}
#endif
+static int perf_event_drv_configs(struct perf_event *event,
+ void __user *arg)
+{
+ if (!event->pmu->get_drv_configs)
+ return -EINVAL;
+
+ return event->pmu->get_drv_configs(event, arg);
+}
+
/*
* hrtimer based swevent callback
*/
@@ -7355,6 +7553,8 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
+
+ .events_across_hotplug = 1,
};
/*
@@ -7436,6 +7636,8 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
+
+ .events_across_hotplug = 1,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -7916,6 +8118,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!group_leader)
group_leader = event;
+ mutex_init(&event->group_leader_mutex);
mutex_init(&event->child_mutex);
INIT_LIST_HEAD(&event->child_list);
@@ -7924,6 +8127,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
+ INIT_LIST_HEAD(&event->drv_configs);
INIT_HLIST_NODE(&event->hlist_entry);
@@ -8343,6 +8547,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
return err;
+ if (attr.constraint_duplicate || attr.__reserved_1)
+ return -EINVAL;
+
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -8383,6 +8590,16 @@ SYSCALL_DEFINE5(perf_event_open,
group_leader = NULL;
}
+ /*
+ * Take the group_leader's group_leader_mutex before observing
+ * anything in the group leader that leads to changes in ctx,
+ * many of which may be changing on another thread.
+ * In particular, we want to take this lock before deciding
+ * whether we need to move_group.
+ */
+ if (group_leader)
+ mutex_lock(&group_leader->group_leader_mutex);
+
if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
@@ -8661,6 +8878,8 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group)
perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ if (group_leader)
+ mutex_unlock(&group_leader->group_leader_mutex);
if (task) {
mutex_unlock(&task->signal->cred_guard_mutex);
@@ -8710,6 +8929,8 @@ err_task:
if (task)
put_task_struct(task);
err_group_fd:
+ if (group_leader)
+ mutex_unlock(&group_leader->group_leader_mutex);
fdput(group);
err_fd:
put_unused_fd(event_fd);
@@ -9421,29 +9642,90 @@ static void __perf_event_exit_context(void *__info)
rcu_read_unlock();
}
+static void __perf_event_stop_swclock(void *__info)
+{
+ struct perf_event_context *ctx = __info;
+ struct perf_event *event, *tmp;
+
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ if (event->attr.config == PERF_COUNT_SW_CPU_CLOCK &&
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ cpu_clock_event_stop(event, 0);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
+ struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
+ unsigned long flags;
struct pmu *pmu;
int idx;
idx = srcu_read_lock(&pmus_srcu);
list_for_each_entry_rcu(pmu, &pmus, entry) {
- ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+
+ /* Cancel the mux hrtimer to avoid CPU migration */
+ if (pmu->task_ctx_nr != perf_sw_context) {
+ raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+ hrtimer_cancel(&cpuctx->hrtimer);
+ cpuctx->hrtimer_active = 0;
+ raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock,
+ flags);
+ }
mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ /*
+ * If keeping events across hotplugging is supported, do not
+ * remove the event list, but keep it alive across CPU hotplug.
+ * The context is exited via an fd close path when userspace
+ * is done and the target CPU is online. If software clock
+ * event is active, then stop hrtimer associated with it.
+ * Start the timer when the CPU comes back online.
+ */
+ if (!pmu->events_across_hotplug)
+ smp_call_function_single(cpu, __perf_event_exit_context,
+ ctx, 1);
+ else
+ smp_call_function_single(cpu, __perf_event_stop_swclock,
+ ctx, 1);
mutex_unlock(&ctx->mutex);
}
srcu_read_unlock(&pmus_srcu, idx);
}
+static void perf_event_start_swclock(int cpu)
+{
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+ struct perf_event *event, *tmp;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->events_across_hotplug) {
+ ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ list_for_each_entry_safe(event, tmp, &ctx->event_list,
+ event_entry) {
+ if (event->attr.config ==
+ PERF_COUNT_SW_CPU_CLOCK &&
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ cpu_clock_event_start(event, 0);
+ }
+ }
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
static void perf_event_exit_cpu(int cpu)
{
perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_start_swclock(int cpu) { }
#endif
static int
@@ -9482,6 +9764,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
+
+ case CPU_STARTING:
+ perf_event_start_swclock(cpu);
+ break;
+
default:
break;
}
@@ -9489,6 +9776,25 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
+static int event_idle_notif(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ switch (action) {
+ case IDLE_START:
+ __this_cpu_write(is_idle, true);
+ break;
+ case IDLE_END:
+ __this_cpu_write(is_idle, false);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block perf_event_idle_nb = {
+ .notifier_call = event_idle_notif,
+};
+
void __init perf_event_init(void)
{
int ret;
@@ -9502,6 +9808,7 @@ void __init perf_event_init(void)
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
+ idle_notifier_register(&perf_event_idle_nb);
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
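Much of the perf churn above is about keeping software clock events (PERF_COUNT_SW_CPU_CLOCK) alive across CPU hotplug and idle. Below is a hedged sketch of opening and reading such a counter through the stock perf_event_open(2) interface; it deliberately sticks to standard ioctls and does not use the vendor additions in this patch (e.g. PERF_EVENT_IOC_SET_DRV_CONFIGS).

```c
/*
 * Open a per-thread software CPU-clock counter, run some work, and read
 * the accumulated value. Standard perf_event_open(2) ABI only; error
 * handling is intentionally minimal.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.disabled = 1;
	attr.exclude_kernel = 1;	/* stays within unprivileged paranoid levels */

	fd = perf_event_open(&attr, 0 /* this thread */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	for (volatile int i = 0; i < 10000000; i++)
		;			/* some work to measure */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cpu-clock: %lld ns\n", count);
	close(fd);
	return 0;
}
```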
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a27245fdcd81..d97a5430d580 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -598,6 +598,8 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
+
+ .events_across_hotplug = 1,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c1274c..2b229fdcfc09 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,7 +11,6 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
- struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 58013ef228a1..f4b9a369c8c3 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -222,8 +222,6 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
-static void rb_irq_work(struct irq_work *work);
-
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -244,16 +242,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
- init_irq_work(&rb->irq_work, rb_irq_work);
-}
-
-static void ring_buffer_put_async(struct ring_buffer *rb)
-{
- if (!atomic_dec_and_test(&rb->refcount))
- return;
-
- rb->rcu_head.next = (void *)rb;
- irq_work_queue(&rb->irq_work);
}
/*
@@ -265,6 +253,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb)
* The ordering is similar to that of perf_output_{begin,end}, with
* the exception of (B), which should be taken care of by the pmu
* driver, since ordering rules will differ depending on hardware.
+ *
+ * Call this from pmu::start(); see the comment in perf_aux_output_end()
+ * about its use in pmu callbacks. Both can also be called from the PMI
+ * handler if needed.
*/
void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event)
@@ -289,6 +281,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
goto err;
/*
+ * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
+ * the aux buffer is in perf_mmap_close(), about to get freed.
+ */
+ if (!atomic_read(&rb->aux_mmap_count))
+ goto err_put;
+
+ /*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
@@ -329,10 +328,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
return handle->rb->aux_priv;
err_put:
+ /* can't be last */
rb_free_aux(rb);
err:
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
handle->event = NULL;
return NULL;
@@ -343,6 +343,10 @@ err:
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
* pmu driver's responsibility to observe ordering rules of the hardware,
* so that all the data is externally visible before this is called.
+ *
+ * Note: this has to be called from pmu::stop() callback, as the assumption
+ * of the AUX buffer management code is that after pmu::stop(), the AUX
+ * transaction must be stopped and therefore drop the AUX reference count.
*/
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
bool truncated)
@@ -390,8 +394,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
handle->event = NULL;
local_set(&rb->aux_nest, 0);
+ /* can't be last */
rb_free_aux(rb);
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
}
/*
@@ -468,6 +473,33 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
__free_page(page);
}
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ /*
+ * Should never happen, the last reference should be dropped from
+ * perf_mmap_close() path, which first stops aux transactions (which
+ * in turn are the atomic holders of aux_refcount) and then does the
+ * last rb_free_aux().
+ */
+ WARN_ON_ONCE(in_atomic());
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
+}
+
int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
@@ -531,7 +563,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
goto out;
}
- rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
goto out;
@@ -556,45 +588,15 @@ out:
if (!ret)
rb->aux_pgoff = pgoff;
else
- rb_free_aux(rb);
+ __rb_free_aux(rb);
return ret;
}
-static void __rb_free_aux(struct ring_buffer *rb)
-{
- int pg;
-
- if (rb->aux_priv) {
- rb->free_aux(rb->aux_priv);
- rb->free_aux = NULL;
- rb->aux_priv = NULL;
- }
-
- if (rb->aux_nr_pages) {
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
-
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
- }
-}
-
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
- irq_work_queue(&rb->irq_work);
-}
-
-static void rb_irq_work(struct irq_work *work)
-{
- struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
-
- if (!atomic_read(&rb->aux_refcount))
__rb_free_aux(rb);
-
- if (rb->rcu_head.next == (void *)rb)
- call_rcu(&rb->rcu_head, rb_free_rcu);
}
#ifndef CONFIG_PERF_USE_VMALLOC
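rb_free_aux() now frees the AUX buffer synchronously when the last reference drops, instead of bouncing through an irq_work, relying on the final put never happening from atomic context (hence the WARN_ON_ONCE(in_atomic())). A generic sketch of the drop-last-reference-frees pattern with C11 atomics; struct aux_buf is a stand-in type, not the perf ring-buffer structure.

```c
/*
 * Generic "the last reference frees the object" pattern, as rb_free_aux()
 * now does synchronously.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct aux_buf {
	atomic_int refcount;
	/* ... pages, pmu private data ... */
};

static struct aux_buf *aux_buf_alloc(void)
{
	struct aux_buf *b = malloc(sizeof(*b));

	if (b)
		atomic_init(&b->refcount, 1);
	return b;
}

static void aux_buf_get(struct aux_buf *b)
{
	atomic_fetch_add(&b->refcount, 1);
}

static void aux_buf_put(struct aux_buf *b)
{
	/* analogous to atomic_dec_and_test() -> __rb_free_aux() */
	if (atomic_fetch_sub(&b->refcount, 1) == 1) {
		puts("last reference dropped, freeing");
		free(b);
	}
}

int main(void)
{
	struct aux_buf *b = aux_buf_alloc();

	if (!b)
		return 1;
	aux_buf_get(b);		/* e.g. an in-flight AUX transaction */
	aux_buf_put(b);		/* transaction ends */
	aux_buf_put(b);		/* final put frees synchronously */
	return 0;
}
```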
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index aad43c88a668..26a4cba37128 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1693,8 +1693,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
int result;
pagefault_disable();
- result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
- sizeof(opcode));
+ result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
if (likely(result == 0))
diff --git a/kernel/exit.c b/kernel/exit.c
index e57bff761b88..fc82e495b729 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -391,6 +391,7 @@ static void exit_mm(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
+ int mm_released;
mm_release(tsk, mm);
if (!mm)
@@ -437,9 +438,12 @@ static void exit_mm(struct task_struct *tsk)
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mm_update_next_owner(mm);
- mmput(mm);
+
+ mm_released = mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
+ if (mm_released)
+ set_tsk_thread_flag(tsk, TIF_MM_RELEASED);
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -635,6 +639,7 @@ static void check_stack_usage(void)
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
unsigned long free;
+ int islower = false;
free = stack_not_used(current);
@@ -643,11 +648,16 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
- current->comm, task_pid_nr(current), free);
lowest_to_date = free;
+ islower = true;
}
spin_unlock(&low_water_lock);
+
+ if (islower) {
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
+ }
}
#else
static inline void check_stack_usage(void) {}
@@ -704,6 +714,7 @@ void do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
+ sched_exit(tsk);
schedtune_exit_task(tsk);
/*
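check_stack_usage() now only records the new low-water mark under low_water_lock and prints the warning after the lock is released, keeping the slow console path out of the critical section. The same shape in plain C, with a pthread mutex standing in for the spinlock:

```c
/*
 * Record the decision under the lock, do the slow printing after the
 * unlock -- the restructuring applied to check_stack_usage().
 */
#include <pthread.h>
#include <stdio.h>
#include <limits.h>

static pthread_mutex_t low_water_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long lowest_to_date = ULONG_MAX;

static void note_stack_free(unsigned long free_bytes)
{
	int islower = 0;

	pthread_mutex_lock(&low_water_lock);
	if (free_bytes < lowest_to_date) {
		lowest_to_date = free_bytes;
		islower = 1;		/* remember, print later */
	}
	pthread_mutex_unlock(&low_water_lock);

	if (islower)			/* slow path runs without the lock */
		printf("new low-water mark: %lu bytes left\n", free_bytes);
}

int main(void)
{
	note_stack_free(4096);
	note_stack_free(1024);
	return 0;
}
```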
diff --git a/kernel/fork.c b/kernel/fork.c
index 5a90b0f6d668..77a21c3d02e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
+#include <linux/kasan.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
@@ -174,6 +175,7 @@ static inline void free_thread_stack(unsigned long *stack)
{
struct page *page = virt_to_page(stack);
+ kasan_alloc_pages(page, THREAD_SIZE_ORDER);
kaiser_unmap_thread_stack(stack);
__free_kmem_pages(page, THREAD_SIZE_ORDER);
}
@@ -729,12 +731,16 @@ static inline void __mmput(struct mm_struct *mm)
/*
* Decrement the use count and release all resources for an mm.
*/
-void mmput(struct mm_struct *mm)
+int mmput(struct mm_struct *mm)
{
+ int mm_freed = 0;
might_sleep();
- if (atomic_dec_and_test(&mm->mm_users))
+ if (atomic_dec_and_test(&mm->mm_users)) {
__mmput(mm);
+ mm_freed = 1;
+ }
+ return mm_freed;
}
EXPORT_SYMBOL_GPL(mmput);
@@ -1722,6 +1728,7 @@ bad_fork_cleanup_audit:
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
+ free_task_load_ptrs(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
@@ -1753,7 +1760,7 @@ struct task_struct *fork_idle(int cpu)
cpu_to_node(cpu));
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
- init_idle(task, cpu);
+ init_idle(task, cpu, false);
}
return task;
diff --git a/kernel/futex.c b/kernel/futex.c
index a26d217c99fe..aedb36c0fd92 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -766,7 +766,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
int ret;
pagefault_disable();
- ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+ ret = __get_user(*dest, from);
pagefault_enable();
return ret ? -EFAULT : 0;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3c74e13a95dc..825ae448c7a9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -836,7 +836,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
- IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
+ IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT |
+ IRQD_AFFINITY_MANAGED);
if (irq_settings_has_no_balance_set(desc))
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
if (irq_settings_is_per_cpu(desc))
@@ -845,6 +846,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
+ if (irq_settings_has_affinity_managed_set(desc))
+ irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 011f8c4c63da..4684b7595e63 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -11,6 +11,7 @@
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/irq.h>
+#include <linux/cpumask.h>
#include "internals.h"
@@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
const struct cpumask *affinity = d->common->affinity;
struct irq_chip *c;
bool ret = false;
+ struct cpumask available_cpus;
/*
* If this is a per-CPU interrupt, or the affinity does not
@@ -29,8 +31,37 @@ static bool migrate_one_irq(struct irq_desc *desc)
!cpumask_test_cpu(smp_processor_id(), affinity))
return false;
+ cpumask_copy(&available_cpus, affinity);
+ cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask);
+ affinity = &available_cpus;
+
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- affinity = cpu_online_mask;
+ /*
+ * The order of preference for selecting a fallback CPU is
+ *
+ * (1) online and un-isolated CPU from default affinity
+ * (2) online and un-isolated CPU
+ * (3) online CPU
+ */
+ cpumask_andnot(&available_cpus, cpu_online_mask,
+ cpu_isolated_mask);
+ if (cpumask_intersects(&available_cpus, irq_default_affinity))
+ cpumask_and(&available_cpus, &available_cpus,
+ irq_default_affinity);
+ else if (cpumask_empty(&available_cpus))
+ affinity = cpu_online_mask;
+
+ /*
+ * We are overriding the affinity with all online and
+ * un-isolated cpus. irq_set_affinity_locked() call
+ * below notify this mask to PM QOS affinity listener.
+ * That results in applying the CPU_DMA_LATENCY QOS
+ * to all the CPUs specified in the mask. But the low
+ * level irqchip driver sets the affinity of an irq
+ * to only one CPU. So pick only one CPU from the
+ * prepared mask while overriding the user affinity.
+ */
+ affinity = cpumask_of(cpumask_any(affinity));
ret = true;
}
@@ -38,7 +69,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
if (!c->irq_set_affinity) {
pr_debug("IRQ%u: unable to set affinity\n", d->irq);
} else {
- int r = irq_do_set_affinity(d, affinity, false);
+ int r = irq_set_affinity_locked(d, affinity, false);
if (r)
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, r);
@@ -69,6 +100,9 @@ void irq_migrate_all_off_this_cpu(void)
bool affinity_broken;
desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
raw_spin_lock(&desc->lock);
affinity_broken = migrate_one_irq(desc);
raw_spin_unlock(&desc->lock);
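migrate_one_irq() now prefers, in order, online un-isolated CPUs from the default affinity, then any online un-isolated CPU, then any online CPU, and finally collapses the result to a single CPU before calling irq_set_affinity_locked(). A sketch of that selection order using plain 64-bit masks instead of struct cpumask:

```c
/*
 * Preference-ordered fallback selection, mirroring the comment in
 * migrate_one_irq(): (1) online & un-isolated & default affinity,
 * (2) online & un-isolated, (3) online. uint64_t bitmasks are used
 * here instead of the kernel's cpumask API.
 */
#include <stdint.h>
#include <stdio.h>

static int first_cpu(uint64_t mask)
{
	for (int cpu = 0; cpu < 64; cpu++)
		if (mask & (1ULL << cpu))
			return cpu;
	return -1;
}

static int pick_fallback_cpu(uint64_t online, uint64_t isolated,
			     uint64_t default_affinity)
{
	uint64_t avail = online & ~isolated;	/* online, un-isolated */

	if (avail & default_affinity)
		avail &= default_affinity;	/* (1) */
	else if (!avail)
		avail = online;			/* (3) last resort */
	/* else (2): keep the online & un-isolated set */

	return first_cpu(avail);		/* one CPU, like cpumask_any() */
}

int main(void)
{
	uint64_t online = 0x0f, isolated = 0x06, def = 0x04;

	printf("fallback cpu: %d\n", pick_fallback_cpu(online, isolated, def));
	return 0;
}
```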
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fcab63c66905..56afc0be6289 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+extern bool irq_can_set_affinity_usr(unsigned int irq);
+
extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 83cea913983c..36a13b75b48f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
-static int __irq_can_set_affinity(struct irq_desc *desc)
+static bool __irq_can_set_affinity(struct irq_desc *desc)
{
if (!desc || !irqd_can_balance(&desc->irq_data) ||
!desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
- return 1;
+ return false;
+ return true;
}
/**
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)
}
/**
+ * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space
+ * @irq: Interrupt to check
+ *
+ * Like irq_can_set_affinity() above, but additionally checks for the
+ * AFFINITY_MANAGED flag.
+ */
+bool irq_can_set_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return __irq_can_set_affinity(desc) &&
+ !irqd_affinity_is_managed(&desc->irq_data);
+}
+
+/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
* @desc: irq descriptor which has affitnity changed
*
@@ -319,6 +334,9 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
desc->affinity_notify = notify;
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (!notify && old_notify)
+ cancel_work_sync(&old_notify->work);
+
if (old_notify)
kref_put(&old_notify->kref, old_notify->release);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index cd6009006510..41b40f310c28 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq;
+ int i, ret, virq = 0;
ret = ops->msi_check(domain, info, dev);
if (ret == 0)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a2c02fd5d6d0..b05509af0352 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
cpumask_var_t new_value;
int err;
- if (!irq_can_set_affinity(irq) || no_irq_affinity)
+ if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
goto free_cpumask;
}
+ if (cpumask_subset(new_value, cpu_isolated_mask)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
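write_irq_affinity() additionally rejects any mask that is a subset of the isolated CPUs. A small sketch of steering an IRQ from userspace through /proc/irq/<n>/smp_affinity; the IRQ number and mask value are illustrative only.

```c
/*
 * Write a CPU mask to /proc/irq/<n>/smp_affinity. With the hunk above,
 * a mask containing only isolated CPUs is rejected with EINVAL.
 * IRQ number 42 and mask "3" (CPU0 | CPU1) are illustrative values.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	const char *mask = "3";		/* hex bitmask: CPU0 | CPU1 */
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/irq/%d/smp_affinity", 42);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fprintf(f, "%s\n", mask) < 0)
		fprintf(stderr, "writing %s: %s\n", path, strerror(errno));
	if (fclose(f) != 0)		/* procfs errors often surface here */
		fprintf(stderr, "closing %s: %s\n", path, strerror(errno));
	return 0;
}
```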
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 320579d89091..f0964f058521 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -17,6 +17,7 @@ enum {
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
+ _IRQ_AFFINITY_MANAGED = IRQ_AFFINITY_MANAGED,
};
#define IRQ_PER_CPU GOT_YOU_MORON
@@ -32,6 +33,7 @@ enum {
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
+#define IRQ_AFFINITY_MANAGED GOT_YOU_MORON
static inline void
irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
@@ -65,6 +67,16 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
return desc->status_use_accessors & _IRQ_NO_BALANCING;
}
+static inline void irq_settings_set_affinity_managed(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_AFFINITY_MANAGED;
+}
+
+static inline bool irq_settings_has_affinity_managed_set(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_AFFINITY_MANAGED;
+}
+
static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
{
return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
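
The GOT_YOU_MORON poisoning above exists so that code inside kernel/irq/ cannot test the raw IRQ_AFFINITY_MANAGED bit and must use the accessor instead; a short illustration with a hypothetical caller:

static bool desc_has_managed_affinity(struct irq_desc *desc)
{
	/*
	 * Testing IRQ_AFFINITY_MANAGED directly here would not build:
	 * inside kernel/irq/ the name expands to GOT_YOU_MORON.
	 */
	return irq_settings_has_affinity_managed_set(desc);
}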
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90db3909..c61c56f05dfa 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -26,6 +26,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#include <linux/delay.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -378,6 +379,17 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* values at the cost of a few extra spins.
*/
cpu_relax_lowlatency();
+
+ /*
+ * On ARM systems, we must slow down the waiter's repeated
+ * acquisition of spin_mlock and atomics on the lock count, or
+ * we risk starving out a thread attempting to release the
+ * mutex. The mutex slowpath release must take the wait_lock
+ * spinlock. This spinlock can share a monitor with the other
+ * waiter atomics in the mutex data structure, so we must take
+ * care to rate-limit the waiters.
+ */
+ udelay(1);
}
osq_unlock(&lock->osq);
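
The udelay(1) above is a back-off inside the optimistic-spin loop. A stripped-down illustration of the same pattern, assuming a hypothetical wait_for_flag() rather than the real mutex internals (needs <linux/atomic.h> and <linux/delay.h>):

static void wait_for_flag(atomic_t *flag)
{
	while (!atomic_read(flag)) {
		cpu_relax_lowlatency();	/* yield the pipeline briefly */
		udelay(1);		/* rate-limit exclusive-monitor traffic */
	}
}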
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 8d7047ecef4e..0befa20ce96e 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,6 +1,7 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
+#include <linux/sched/rt.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ struct task_struct *task = current;
int curr = encode_cpu(smp_processor_id());
int old;
@@ -131,8 +133,13 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
+ * If a task spins on the owner on one CPU after acquiring
+ * osq_lock while an RT task spins on another CPU to acquire
+ * osq_lock, the RT spinner can prevent the owner from ever
+ * completing if the owner is to be scheduled on the same CPU.
+ * The result is a livelock.
*/
- if (need_resched())
+ if (need_resched() || rt_task(task))
goto unqueue;
cpu_relax_lowlatency();
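
To make the livelock argument concrete, a simplified shape of the spin loop after this change (unqueue handling omitted, so this is a sketch rather than the full osq_lock() body):

	while (!READ_ONCE(node->locked)) {
		/*
		 * Bail if we must reschedule, or if we are an RT task: an
		 * RT spinner could otherwise keep a lower-priority lock
		 * holder off this CPU indefinitely.
		 */
		if (need_resched() || rt_task(current))
			goto unqueue;
		cpu_relax_lowlatency();
	}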
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a596cffa..d381f559e0ce 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,8 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/bug.h>
+#include <soc/qcom/watchdog.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
struct lock_class_key *key)
@@ -64,6 +66,11 @@ static void spin_dump(raw_spinlock_t *lock, const char *msg)
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
lock->owner_cpu);
+#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG
+ msm_trigger_wdog_bite();
+#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG)
+ BUG();
+#endif
dump_stack();
}
@@ -114,7 +121,7 @@ static void __spin_lock_debug(raw_spinlock_t *lock)
__delay(1);
}
/* lockup suspected: */
- spin_dump(lock, "lockup suspected");
+ spin_bug(lock, "lockup suspected");
#ifdef CONFIG_SMP
trigger_all_cpu_backtrace();
#endif
@@ -167,6 +174,11 @@ static void rwlock_bug(rwlock_t *lock, const char *msg)
printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
msg, raw_smp_processor_id(), current->comm,
task_pid_nr(current), lock);
+#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG
+ msm_trigger_wdog_bite();
+#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG)
+ BUG();
+#endif
dump_stack();
}
diff --git a/kernel/module.c b/kernel/module.c
index bcc78f4c15e9..57809d2b4271 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2505,7 +2505,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* We'll tack temporary mod_kallsyms on the end. */
mod->init_size = ALIGN(mod->init_size,
- __alignof__(struct mod_kallsyms));
+ __alignof__(struct mod_kallsyms));
info->mod_kallsyms_init_off = mod->init_size;
mod->init_size += sizeof(struct mod_kallsyms);
mod->init_size = debug_align(mod->init_size);
@@ -2585,7 +2585,13 @@ void * __weak module_alloc(unsigned long size)
return vmalloc_exec(size);
}
-#ifdef CONFIG_DEBUG_KMEMLEAK
+#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF)
+static void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ kmemleak_no_scan(mod->module_core);
+}
+#elif defined(CONFIG_DEBUG_KMEMLEAK)
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
{
diff --git a/kernel/panic.c b/kernel/panic.c
index 1d07cf9af849..75f564a94a82 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,10 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
+#include <soc/qcom/minidump.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/exception.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -77,6 +81,8 @@ void panic(const char *fmt, ...)
long i, i_next = 0;
int state = 0;
+ trace_kernel_panic(0);
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -103,6 +109,7 @@ void panic(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+ dump_stack_minidump(0);
pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
@@ -178,6 +185,9 @@ void panic(const char *fmt, ...)
mdelay(PANIC_TIMER_STEP);
}
}
+
+ trace_kernel_panic_late(0);
+
if (panic_timeout != 0) {
/*
* This will not be a clean reboot, with everything
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 797f19e2aaa9..29771c69383b 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -30,6 +30,7 @@
#include <linux/genhd.h>
#include <linux/ktime.h>
#include <trace/events/power.h>
+#include <soc/qcom/boot_stats.h>
#include "power.h"
@@ -469,6 +470,7 @@ static int resume_target_kernel(bool platform_mode)
touch_softlockup_watchdog();
syscore_resume();
+ place_marker("PM: Image Restoration failed!");
Enable_irqs:
local_irq_enable();
@@ -705,6 +707,7 @@ int hibernate(void)
pm_restore_gfp_mask();
} else {
pr_debug("PM: Image restored successfully.\n");
+ place_marker("PM: Image restored!");
}
Free_bitmaps:
@@ -1163,6 +1166,22 @@ static int __init kaslr_nohibernate_setup(char *str)
return nohibernate_setup(str);
}
+static int __init page_poison_nohibernate_setup(char *str)
+{
+#ifdef CONFIG_PAGE_POISONING_ZERO
+ /*
+ * The zeroing option for page poison skips the checks on alloc.
+ * Since hibernation doesn't save free pages, there's no way to
+ * guarantee the pages will still be zeroed.
+ */
+ if (!strcmp(str, "on")) {
+ pr_info("Disabling hibernation due to page poisoning\n");
+ return nohibernate_setup(str);
+ }
+#endif
+ return 1;
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
@@ -1171,3 +1190,4 @@ __setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
__setup("kaslr", kaslr_nohibernate_setup);
+__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 68c0eaae8034..5ea50b1b7595 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -287,13 +287,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
}
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
@@ -571,14 +565,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 436e302eb024..2610516601ee 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
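
For reference, power_attr_ro(pm_wakeup_irq) expands roughly to the following, which is why the stub _store handlers above could be dropped (writes now fail at the sysfs layer instead of returning -EINVAL from a dummy store callback):

static struct kobj_attribute pm_wakeup_irq_attr = {
	.attr	= {
		.name = "pm_wakeup_irq",
		.mode = S_IRUGO,
	},
	.show	= pm_wakeup_irq_show,
	/* no .store method */
};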
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..e6eceb0aa496 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -43,6 +43,9 @@
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/cpumask.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -67,6 +70,8 @@ static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_constraints cpu_dma_constraints = {
.list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .target_per_cpu = { [0 ... (NR_CPUS - 1)] =
+ PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE },
.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
@@ -81,6 +86,8 @@ static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_constraints network_lat_constraints = {
.list = PLIST_HEAD_INIT(network_lat_constraints.list),
.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .target_per_cpu = { [0 ... (NR_CPUS - 1)] =
+ PM_QOS_NETWORK_LAT_DEFAULT_VALUE },
.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
@@ -91,11 +98,12 @@ static struct pm_qos_object network_lat_pm_qos = {
.name = "network_latency",
};
-
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_constraints network_tput_constraints = {
.list = PLIST_HEAD_INIT(network_tput_constraints.list),
.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .target_per_cpu = { [0 ... (NR_CPUS - 1)] =
+ PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE },
.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
@@ -259,22 +267,60 @@ static const struct file_operations pm_qos_debug_fops = {
.release = single_release,
};
+static inline void pm_qos_set_value_for_cpus(struct pm_qos_constraints *c,
+ struct cpumask *cpus)
+{
+ struct pm_qos_request *req = NULL;
+ int cpu;
+ s32 qos_val[NR_CPUS] = { [0 ... (NR_CPUS - 1)] = c->default_value };
+
+ plist_for_each_entry(req, &c->list, node) {
+ for_each_cpu(cpu, &req->cpus_affine) {
+ switch (c->type) {
+ case PM_QOS_MIN:
+ if (qos_val[cpu] > req->node.prio)
+ qos_val[cpu] = req->node.prio;
+ break;
+ case PM_QOS_MAX:
+ if (req->node.prio > qos_val[cpu])
+ qos_val[cpu] = req->node.prio;
+ break;
+ case PM_QOS_SUM:
+ qos_val[cpu] += req->node.prio;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (c->target_per_cpu[cpu] != qos_val[cpu])
+ cpumask_set_cpu(cpu, cpus);
+ c->target_per_cpu[cpu] = qos_val[cpu];
+ }
+}
+
/**
* pm_qos_update_target - manages the constraints list and calls the notifiers
* if needed
* @c: constraints data struct
- * @node: request to add to the list, to update or to remove
+ * @req: request to add to the list, to update or to remove
* @action: action to take on the constraints list
* @value: value of the request to add or update
*
* This function returns 1 if the aggregated constraint value has changed, 0
* otherwise.
*/
-int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
- enum pm_qos_req_action action, int value)
+int pm_qos_update_target(struct pm_qos_constraints *c,
+ struct pm_qos_request *req,
+ enum pm_qos_req_action action, int value)
{
unsigned long flags;
int prev_value, curr_value, new_value;
+ struct plist_node *node = &req->node;
+ struct cpumask cpus;
int ret;
spin_lock_irqsave(&pm_qos_lock, flags);
@@ -305,17 +351,23 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
}
curr_value = pm_qos_get_value(c);
+ cpumask_clear(&cpus);
pm_qos_set_value(c, curr_value);
+ pm_qos_set_value_for_cpus(c, &cpus);
spin_unlock_irqrestore(&pm_qos_lock, flags);
trace_pm_qos_update_target(action, prev_value, curr_value);
- if (prev_value != curr_value) {
+ /*
+ * If any CPU mask bits are set, call the notifier chain to
+ * update the new QoS restriction for those cores.
+ */
+ if (!cpumask_empty(&cpus)) {
ret = 1;
if (c->notifiers)
blocking_notifier_call_chain(c->notifiers,
(unsigned long)curr_value,
- NULL);
+ &cpus);
} else {
ret = 0;
}
@@ -398,12 +450,56 @@ int pm_qos_request(int pm_qos_class)
}
EXPORT_SYMBOL_GPL(pm_qos_request);
+int pm_qos_request_for_cpu(int pm_qos_class, int cpu)
+{
+ if (cpu_isolated(cpu))
+ return INT_MAX;
+
+ return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu];
+}
+EXPORT_SYMBOL(pm_qos_request_for_cpu);
+
int pm_qos_request_active(struct pm_qos_request *req)
{
return req->pm_qos_class != 0;
}
EXPORT_SYMBOL_GPL(pm_qos_request_active);
+int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask)
+{
+ unsigned long irqflags;
+ int cpu;
+ struct pm_qos_constraints *c = NULL;
+ int val;
+
+ spin_lock_irqsave(&pm_qos_lock, irqflags);
+ c = pm_qos_array[pm_qos_class]->constraints;
+ val = c->default_value;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu_isolated(cpu))
+ continue;
+
+ switch (c->type) {
+ case PM_QOS_MIN:
+ if (c->target_per_cpu[cpu] < val)
+ val = c->target_per_cpu[cpu];
+ break;
+ case PM_QOS_MAX:
+ if (c->target_per_cpu[cpu] > val)
+ val = c->target_per_cpu[cpu];
+ break;
+ default:
+ BUG();
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&pm_qos_lock, irqflags);
+
+ return val;
+}
+EXPORT_SYMBOL(pm_qos_request_for_cpumask);
+
static void __pm_qos_update_request(struct pm_qos_request *req,
s32 new_value)
{
@@ -412,7 +508,7 @@ static void __pm_qos_update_request(struct pm_qos_request *req,
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ req, PM_QOS_UPDATE_REQ, new_value);
}
/**
@@ -430,6 +526,41 @@ static void pm_qos_work_fn(struct work_struct *work)
__pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
}
+#ifdef CONFIG_SMP
+static void pm_qos_irq_release(struct kref *ref)
+{
+ unsigned long flags;
+ struct irq_affinity_notify *notify = container_of(ref,
+ struct irq_affinity_notify, kref);
+ struct pm_qos_request *req = container_of(notify,
+ struct pm_qos_request, irq_notify);
+ struct pm_qos_constraints *c =
+ pm_qos_array[req->pm_qos_class]->constraints;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ cpumask_setall(&req->cpus_affine);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, c->default_value);
+}
+
+static void pm_qos_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ unsigned long flags;
+ struct pm_qos_request *req = container_of(notify,
+ struct pm_qos_request, irq_notify);
+ struct pm_qos_constraints *c =
+ pm_qos_array[req->pm_qos_class]->constraints;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ cpumask_copy(&req->cpus_affine, mask);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, req->node.prio);
+}
+#endif
+
/**
* pm_qos_add_request - inserts new qos request into the list
* @req: pointer to a preallocated handle
@@ -453,11 +584,70 @@ void pm_qos_add_request(struct pm_qos_request *req,
WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
return;
}
+
+ switch (req->type) {
+ case PM_QOS_REQ_AFFINE_CORES:
+ if (cpumask_empty(&req->cpus_affine)) {
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ WARN(1, KERN_ERR "Affine cores not set for request with affinity flag\n");
+ }
+ break;
+#ifdef CONFIG_SMP
+ case PM_QOS_REQ_AFFINE_IRQ:
+ if (irq_can_set_affinity(req->irq)) {
+ struct irq_desc *desc = irq_to_desc(req->irq);
+ struct cpumask *mask;
+
+ if (!desc)
+ return;
+ mask = desc->irq_data.common->affinity;
+
+ /* Get the current affinity */
+ cpumask_copy(&req->cpus_affine, mask);
+ req->irq_notify.irq = req->irq;
+ req->irq_notify.notify = pm_qos_irq_notify;
+ req->irq_notify.release = pm_qos_irq_release;
+
+ } else {
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ WARN(1, KERN_ERR "IRQ-%d not set for request with affinity flag\n",
+ req->irq);
+ }
+ break;
+#endif
+ default:
+ WARN(1, KERN_ERR "Unknown request type %d\n", req->type);
+ /* fall through */
+ case PM_QOS_REQ_ALL_CORES:
+ cpumask_setall(&req->cpus_affine);
+ break;
+ }
+
req->pm_qos_class = pm_qos_class;
INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
trace_pm_qos_add_request(pm_qos_class, value);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
- &req->node, PM_QOS_ADD_REQ, value);
+ req, PM_QOS_ADD_REQ, value);
+
+#ifdef CONFIG_SMP
+ if (req->type == PM_QOS_REQ_AFFINE_IRQ &&
+ irq_can_set_affinity(req->irq)) {
+ int ret = 0;
+
+ ret = irq_set_affinity_notifier(req->irq,
+ &req->irq_notify);
+ if (ret) {
+ WARN(1, "IRQ affinity notify set failed\n");
+ req->type = PM_QOS_REQ_ALL_CORES;
+ cpumask_setall(&req->cpus_affine);
+ pm_qos_update_target(
+ pm_qos_array[pm_qos_class]->constraints,
+ req, PM_QOS_UPDATE_REQ, value);
+ }
+ }
+#endif
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
@@ -511,7 +701,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ req, PM_QOS_UPDATE_REQ, new_value);
schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
}
@@ -531,15 +721,25 @@ void pm_qos_remove_request(struct pm_qos_request *req)
/* silent return to keep pcm code cleaner */
if (!pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ WARN(1, "pm_qos_remove_request() called for unknown object\n");
return;
}
cancel_delayed_work_sync(&req->work);
+#ifdef CONFIG_SMP
+ if (req->type == PM_QOS_REQ_AFFINE_IRQ) {
+ int ret = 0;
+ /* Get the current affinity */
+ ret = irq_set_affinity_notifier(req->irq, NULL);
+ if (ret)
+ WARN(1, "IRQ affinity notify set failed\n");
+ }
+#endif
+
trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_REMOVE_REQ,
+ req, PM_QOS_REMOVE_REQ,
PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
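
A sketch of how a client might use the per-CPU additions to pm_qos; my_req and my_driver_latency_constrain() are hypothetical, while the request type, the cpus_affine field and pm_qos_request_for_cpu() come from this patch.

#include <linux/pm_qos.h>

static struct pm_qos_request my_req = {
	.type = PM_QOS_REQ_AFFINE_CORES,
};

static void my_driver_latency_constrain(const struct cpumask *cpus)
{
	/* Constrain wakeup latency only on the CPUs this driver uses. */
	cpumask_copy(&my_req.cpus_affine, cpus);
	pm_qos_add_request(&my_req, PM_QOS_CPU_DMA_LATENCY, 100);

	/*
	 * CPUs outside 'cpus' keep the class default in target_per_cpu[],
	 * and isolated CPUs report INT_MAX.
	 */
	pr_info("cpu0 target: %d\n",
		pm_qos_request_for_cpu(PM_QOS_CPU_DMA_LATENCY, 0));
}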
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 12cd989dadf6..1a2efc07c1d0 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -36,6 +36,15 @@
#define HIBERNATE_SIG "S1SUSPEND"
+static int goldenimage;
+/*
+ * When reading an {un,}compressed image, we may restore pages in place,
+ * in which case some architectures need these pages to be cleaned before
+ * they can be executed. We don't know which pages these may be, so clean
+ * the lot.
+ */
+static bool clean_pages_on_read;
+static bool clean_pages_on_decompress;
+
/*
* The swap map is a data structure used for keeping track of each page
* written to a swap partition. It consists of many swap_map_page
@@ -241,6 +250,9 @@ static void hib_end_io(struct bio *bio)
if (bio_data_dir(bio) == WRITE)
put_page(page);
+ else if (clean_pages_on_read)
+ flush_icache_range((unsigned long)page_address(page),
+ (unsigned long)page_address(page) + PAGE_SIZE);
if (bio->bi_error && !hb->error)
hb->error = bio->bi_error;
@@ -1049,6 +1061,7 @@ static int load_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
+ clean_pages_on_read = true;
printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
nr_to_read);
m = nr_to_read / 10;
@@ -1124,6 +1137,10 @@ static int lzo_decompress_threadfn(void *data)
d->unc_len = LZO_UNC_SIZE;
d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
d->unc, &d->unc_len);
+ if (clean_pages_on_decompress)
+ flush_icache_range((unsigned long)d->unc,
+ (unsigned long)d->unc + d->unc_len);
+
atomic_set(&d->stop, 1);
wake_up(&d->done);
}
@@ -1189,6 +1206,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
memset(crc, 0, offsetof(struct crc_data, go));
+ clean_pages_on_decompress = true;
+
/*
* Start the decompression threads.
*/
@@ -1513,7 +1532,13 @@ int swsusp_check(void)
goto put;
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
- memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
+ if (!goldenimage) {
+ pr_debug("PM: corrupt hibernate image header\n");
+ memcpy(swsusp_header->sig,
+ swsusp_header->orig_sig, 10);
+ } else {
+ pr_debug("PM: Header corruption avoided\n");
+ }
/* Reset swap signature now */
error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
@@ -1587,3 +1612,10 @@ static int swsusp_header_init(void)
}
core_initcall(swsusp_header_init);
+
+static int __init golden_image_setup(char *str)
+{
+ goldenimage = 1;
+ return 1;
+}
+__setup("golden_image", golden_image_setup);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 62ffd417622c..eb6a190c7a3a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -236,7 +236,11 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
-};
+}
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+__packed __aligned(4)
+#endif
+;
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
@@ -277,11 +281,7 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define LOG_ALIGN 4
-#else
#define LOG_ALIGN __alignof__(struct printk_log)
-#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
@@ -2124,6 +2124,8 @@ void resume_console(void)
console_unlock();
}
+#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG
+
/**
* console_cpu_notify - print deferred console messages after CPU hotplug
* @self: notifier struct
@@ -2143,12 +2145,18 @@ static int console_cpu_notify(struct notifier_block *self,
case CPU_DEAD:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
+ case CPU_DYING:
+#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG
console_lock();
console_unlock();
+#endif
+ break;
}
return NOTIFY_OK;
}
+#endif
+
/**
* console_lock - lock the console system for exclusive use.
*
@@ -2713,7 +2721,9 @@ static int __init printk_late_init(void)
unregister_console(con);
}
}
+#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG
hotcpu_notifier(console_cpu_notify, 0);
+#endif
return 0;
}
late_initcall(printk_late_init);
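
The __packed __aligned(4) annotation near the top of this file keeps log records 4-byte aligned on architectures with efficient unaligned access, which is what the old hard-coded LOG_ALIGN of 4 provided; a compile-time check one could add to document that intent (not part of the patch, assumes <linux/bug.h>):

static inline void printk_log_align_check(void)
{
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	/* Packed records must still start on 4-byte boundaries. */
	BUILD_BUG_ON(__alignof__(struct printk_log) != 4);
#endif
}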
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8a62cbfe1f2f..3decfbc88308 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,8 @@
#include <linux/trace_events.h>
#include <linux/suspend.h>
+#include <soc/qcom/watchdog.h>
+
#include "tree.h"
#include "rcu.h"
@@ -246,24 +248,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
- unsigned long flags;
-
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
- trace_rcu_grace_period(TPS("rcu_sched"),
- __this_cpu_read(rcu_sched_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- local_irq_save(flags);
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data),
- true);
- }
- local_irq_restore(flags);
- }
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
+ return;
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
}
void rcu_bh_qs(void)
@@ -300,17 +295,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* We inform the RCU core by emulating a zero-duration dyntick-idle
* period, which we in turn do by incrementing the ->dynticks counter
* by two.
+ *
+ * The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
- local_irq_save(flags);
-
/*
* Yes, we can lose flag-setting operations. This is OK, because
* the flag will be set again after some delay.
@@ -340,13 +334,12 @@ static void rcu_momentary_dyntick_idle(void)
smp_mb__after_atomic(); /* Later stuff after QS. */
break;
}
- local_irq_restore(flags);
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
+ * The caller must have disabled interrupts.
*/
void rcu_note_context_switch(void)
{
@@ -376,9 +369,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
void rcu_all_qs(void)
{
+ unsigned long flags;
+
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ local_irq_save(flags);
rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -1310,6 +1308,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
rcu_check_gp_kthread_starvation(rsp);
+#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE
+ /* Induce watchdog bite */
+ msm_trigger_wdog_bite();
+#endif
+
force_quiescent_state(rsp); /* Kick them all. */
}
@@ -1345,6 +1348,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE
+ /* Induce non secure watchdog bite to collect context */
+ msm_trigger_wdog_bite();
+#endif
+
/*
* Attempt to revive the RCU machinery by forcing a context switch.
*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 32cbe72bf545..c6fc11d626f8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -147,8 +147,8 @@ static void __init rcu_bootup_announce(void)
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
-static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long flags) __releases(rnp->lock)
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
+ __releases(rnp->lock) /* But leaves rrupts disabled. */
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
@@ -236,7 +236,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
- raw_spin_unlock(&rnp->lock);
+ raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
@@ -251,7 +251,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
- local_irq_restore(flags);
}
/*
@@ -286,12 +285,11 @@ static void rcu_preempt_qs(void)
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
- * Caller must disable preemption.
+ * Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -301,7 +299,7 @@ static void rcu_preempt_note_context_switch(void)
/* Possibly blocking in an RCU read-side critical section. */
rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock(&rnp->lock);
smp_mb__after_unlock_lock();
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
@@ -318,7 +316,7 @@ static void rcu_preempt_note_context_switch(void)
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
- rcu_preempt_ctxt_queue(rnp, rdp, flags);
+ rcu_preempt_ctxt_queue(rnp, rdp);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
diff --git a/kernel/resource.c b/kernel/resource.c
index 41718cd8cab5..73348f574163 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -172,7 +172,7 @@ static const struct file_operations proc_iomem_operations = {
static int __init ioresources_init(void)
{
proc_create("ioports", 0, NULL, &proc_ioports_operations);
- proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations);
return 0;
}
__initcall(ioresources_init);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 99378130a42f..7dde1b9918e4 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,13 +17,14 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o completion.o idle.o sched_avg.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
-obj-$(CONFIG_SCHED_WALT) += walt.o
+obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c
new file mode 100644
index 000000000000..5bdd51b1e55e
--- /dev/null
+++ b/kernel/sched/boost.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "sched.h"
+#include <linux/of.h>
+#include <linux/sched/core_ctl.h>
+#include <trace/events/sched.h>
+
+/*
+ * Scheduler boost is a mechanism to temporarily place tasks on CPUs
+ * with a higher capacity than the ones where the tasks would normally
+ * have ended up based on their load characteristics. Any entity that
+ * enables boost is responsible for disabling it as well.
+ */
+
+unsigned int sysctl_sched_boost;
+static enum sched_boost_policy boost_policy;
+static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
+static DEFINE_MUTEX(boost_mutex);
+static unsigned int freq_aggr_threshold_backup;
+
+static inline void boost_kick(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
+ smp_send_reschedule(cpu);
+}
+
+static void boost_kick_cpus(void)
+{
+ int i;
+ struct cpumask kick_mask;
+
+ if (boost_policy != SCHED_BOOST_ON_BIG)
+ return;
+
+ cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask);
+
+ for_each_cpu(i, &kick_mask) {
+ if (cpu_capacity(i) != max_capacity)
+ boost_kick(i);
+ }
+}
+
+int got_boost_kick(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ return test_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+void clear_boost_kick(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ clear_bit(BOOST_KICK, &rq->hmp_flags);
+}
+
+/*
+ * Scheduler boost type and boost policy might at first seem unrelated,
+ * however, there exists a connection between them that will allow us
+ * to use them interchangeably during placement decisions. We'll explain
+ * the connection here in one possible way so that the implications are
+ * clear when looking at placement policies.
+ *
+ * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
+ * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
+ * neither be none nor RESTRAINED.
+ */
+static void set_boost_policy(int type)
+{
+ if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) {
+ boost_policy = SCHED_BOOST_NONE;
+ return;
+ }
+
+ if (boost_policy_dt) {
+ boost_policy = boost_policy_dt;
+ return;
+ }
+
+ if (min_possible_efficiency != max_possible_efficiency) {
+ boost_policy = SCHED_BOOST_ON_BIG;
+ return;
+ }
+
+ boost_policy = SCHED_BOOST_ON_ALL;
+}
+
+enum sched_boost_policy sched_boost_policy(void)
+{
+ return boost_policy;
+}
+
+static bool verify_boost_params(int old_val, int new_val)
+{
+ /*
+ * Boost can only be turned on or off. There is no possibility of
+ * switching from one boost type to another or of setting the same
+ * kind of boost several times.
+ */
+ return !(!!old_val == !!new_val);
+}
+
+static void _sched_set_boost(int old_val, int type)
+{
+ switch (type) {
+ case NO_BOOST:
+ if (old_val == FULL_THROTTLE_BOOST)
+ core_ctl_set_boost(false);
+ else if (old_val == CONSERVATIVE_BOOST)
+ restore_cgroup_boost_settings();
+ else
+ update_freq_aggregate_threshold(
+ freq_aggr_threshold_backup);
+ break;
+
+ case FULL_THROTTLE_BOOST:
+ core_ctl_set_boost(true);
+ boost_kick_cpus();
+ break;
+
+ case CONSERVATIVE_BOOST:
+ update_cgroup_boost_settings();
+ boost_kick_cpus();
+ break;
+
+ case RESTRAINED_BOOST:
+ freq_aggr_threshold_backup =
+ update_freq_aggregate_threshold(1);
+ break;
+
+ default:
+ WARN_ON(1);
+ return;
+ }
+
+ set_boost_policy(type);
+ sysctl_sched_boost = type;
+ trace_sched_set_boost(type);
+}
+
+void sched_boost_parse_dt(void)
+{
+ struct device_node *sn;
+ const char *boost_policy;
+
+ sn = of_find_node_by_path("/sched-hmp");
+ if (!sn)
+ return;
+
+ if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
+ if (!strcmp(boost_policy, "boost-on-big"))
+ boost_policy_dt = SCHED_BOOST_ON_BIG;
+ else if (!strcmp(boost_policy, "boost-on-all"))
+ boost_policy_dt = SCHED_BOOST_ON_ALL;
+ }
+}
+
+int sched_set_boost(int type)
+{
+ int ret = 0;
+
+ mutex_lock(&boost_mutex);
+
+ if (verify_boost_params(sysctl_sched_boost, type))
+ _sched_set_boost(sysctl_sched_boost, type);
+ else
+ ret = -EINVAL;
+
+ mutex_unlock(&boost_mutex);
+ return ret;
+}
+
+int sched_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int *data = (unsigned int *)table->data;
+ unsigned int old_val;
+
+ mutex_lock(&boost_mutex);
+
+ old_val = *data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto done;
+
+ if (verify_boost_params(old_val, *data)) {
+ _sched_set_boost(old_val, *data);
+ } else {
+ *data = old_val;
+ ret = -EINVAL;
+ }
+
+done:
+ mutex_unlock(&boost_mutex);
+ return ret;
+}
+
+int sched_boost(void)
+{
+ return sysctl_sched_boost;
+}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041f5b0a..bc54e84675da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 328e17dc2f9b..6b9021b9bc26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
* Thomas Gleixner, Mike Kravetz
*/
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
@@ -74,6 +75,8 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/irq.h>
+#include <linux/sched/core_ctl.h>
#include <linux/cpufreq_times.h>
#include <asm/switch_to.h>
@@ -83,14 +86,19 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
+#ifdef CONFIG_MSM_APP_SETTINGS
+#include <asm/app_api.h>
+#endif
#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
+#include "../time/tick-internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-#include "walt.h"
+
+ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -856,6 +864,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -864,6 +873,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
+ trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -879,6 +889,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
+ if (flags & DEQUEUE_SLEEP)
+ clear_ed_task(p, rq);
+
dequeue_task(rq, p, flags);
}
@@ -1094,8 +1107,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
double_unlock_balance(rq, cpu_rq(new_cpu));
@@ -1105,8 +1118,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
@@ -1128,6 +1141,8 @@ struct migration_arg {
*/
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
+ int src_cpu;
+
if (unlikely(!cpu_active(dest_cpu)))
return rq;
@@ -1135,6 +1150,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return rq;
+ src_cpu = cpu_of(rq);
rq = move_queued_task(rq, p, dest_cpu);
return rq;
@@ -1150,6 +1166,8 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ int src_cpu = cpu_of(rq);
+ bool moved = false;
/*
* The original target cpu might have gone down and we might
@@ -1170,12 +1188,18 @@ static int migration_cpu_stop(void *data)
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
- if (task_rq(p) == rq && task_on_rq_queued(p))
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
rq = __migrate_task(rq, p, arg->dest_cpu);
+ moved = true;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
+
+ if (moved)
+ notify_migration(src_cpu, arg->dest_cpu, false, p);
+
return 0;
}
@@ -1234,6 +1258,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
struct rq *rq;
unsigned int dest_cpu;
int ret = 0;
+ cpumask_t allowed_mask;
rq = task_rq_lock(p, &flags);
@@ -1249,18 +1274,25 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
+ cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+ cpumask_and(&allowed_mask, &allowed_mask, cpu_active_mask);
+
+ dest_cpu = cpumask_any(&allowed_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ cpumask_and(&allowed_mask, cpu_active_mask, new_mask);
+ dest_cpu = cpumask_any(&allowed_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
}
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
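
A condensed restatement of the CPU selection introduced above: prefer CPUs that are both active and not isolated, and fall back to isolated-but-active CPUs only when the requested mask leaves nothing else. pick_allowed_mask() is a hypothetical helper used purely to illustrate the order of the mask operations; locking and the migration path are omitted.

static int pick_allowed_mask(const struct cpumask *new_mask,
			     struct cpumask *allowed_mask)
{
	cpumask_andnot(allowed_mask, new_mask, cpu_isolated_mask);
	cpumask_and(allowed_mask, allowed_mask, cpu_active_mask);
	if (cpumask_any(allowed_mask) < nr_cpu_ids)
		return 0;
	/* Every requested CPU is isolated; tolerate isolation. */
	cpumask_and(allowed_mask, cpu_active_mask, new_mask);
	return cpumask_any(allowed_mask) < nr_cpu_ids ? 0 : -EINVAL;
}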
@@ -1299,6 +1331,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+ * A migrating fair-class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase the migrating task's wait_start
+ * time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1315,7 +1356,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#endif
#endif
- trace_sched_migrate_task(p, new_cpu);
+ trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
@@ -1323,7 +1364,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.nr_migrations++;
perf_event_task_migrate(p);
- walt_fixup_busy_time(p, new_cpu);
+ fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1337,11 +1378,13 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
@@ -1527,7 +1570,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(queued)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ ktime_t to = ktime_set(0, NSEC_PER_MSEC);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
@@ -1573,12 +1616,13 @@ EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
-static int select_fallback_rq(int cpu, struct task_struct *p)
+static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
- enum { cpuset, possible, fail } state = cpuset;
+ enum { cpuset, possible, fail, bug } state = cpuset;
int dest_cpu;
+ int isolated_candidate = -1;
/*
* If the node that the cpu is on has been offlined, cpu_to_node()
@@ -1594,6 +1638,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
continue;
if (!cpu_active(dest_cpu))
continue;
+ if (cpu_isolated(dest_cpu))
+ continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
}
@@ -1606,6 +1652,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
continue;
if (!cpu_active(dest_cpu))
continue;
+ if (cpu_isolated(dest_cpu)) {
+ if (allow_iso)
+ isolated_candidate = dest_cpu;
+ continue;
+ }
+ goto out;
+ }
+
+ if (isolated_candidate != -1) {
+ dest_cpu = isolated_candidate;
goto out;
}
@@ -1624,6 +1680,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
break;
case fail:
+ allow_iso = true;
+ state = bug;
+ break;
+
+ case bug:
BUG();
break;
}
@@ -1652,6 +1713,8 @@ static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
int sibling_count_hint)
{
+ bool allow_isolated = (p->flags & PF_KTHREAD);
+
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
@@ -1669,13 +1732,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
* not worry about this generic constraint ]
*/
if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
- !cpu_online(cpu)))
- cpu = select_fallback_rq(task_cpu(p), p);
+ !cpu_online(cpu)) ||
+ (cpu_isolated(cpu) && !allow_isolated))
+ cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
return cpu;
}
-static void update_avg(u64 *avg, u64 sample)
+void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
*avg += diff >> 3;
@@ -1748,6 +1812,7 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
+
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -1839,6 +1904,8 @@ void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
+ int cpu = smp_processor_id();
+
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
@@ -1846,9 +1913,18 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() &&
+ !got_boost_kick())
return;
+ if (got_boost_kick()) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr->sched_class == &fair_sched_class)
+ check_for_migration(rq, rq->curr);
+ clear_boost_kick(cpu);
+ }
+
/*
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
* traditionally all their work was done from the interrupt return
@@ -1868,7 +1944,7 @@ void scheduler_ipi(void)
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick())) {
+ if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
@@ -1958,11 +2034,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
int sibling_count_hint)
{
unsigned long flags;
- int cpu, success = 0;
+ int cpu, src_cpu, success = 0;
#ifdef CONFIG_SMP
+ unsigned int old_load;
struct rq *rq;
u64 wallclock;
+ struct related_thread_group *grp = NULL;
#endif
+ bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER);
+ bool check_group = false;
+
+ wake_flags &= ~WF_NO_NOTIFIER;
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1972,13 +2054,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
*/
smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ src_cpu = cpu = task_cpu(p);
+
if (!(p->state & state))
goto out;
trace_sched_waking(p);
success = 1; /* we're going to change ->state */
- cpu = task_cpu(p);
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2045,11 +2128,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
rq = cpu_rq(task_cpu(p));
raw_spin_lock(&rq->lock);
- wallclock = walt_ktime_clock();
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ old_load = task_load(p);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
raw_spin_unlock(&rq->lock);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ if (update_preferred_cluster(grp, p, old_load))
+ set_preferred_cluster(grp);
+ rcu_read_unlock();
+ check_group = grp != NULL;
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2058,19 +2149,33 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
sibling_count_hint);
- if (task_cpu(p) != cpu) {
+
+ /* Refresh src_cpu as it could have changed since we last read it */
+ src_cpu = task_cpu(p);
+ if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+ note_task_waking(p, wallclock);
#endif /* CONFIG_SMP */
-
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (freq_notif_allowed) {
+ if (!same_freq_domain(src_cpu, cpu)) {
+ check_for_freq_change(cpu_rq(cpu),
+ false, check_group);
+ check_for_freq_change(cpu_rq(src_cpu),
+ false, check_group);
+ } else if (success) {
+ check_for_freq_change(cpu_rq(cpu), true, false);
+ }
+ }
+
return success;
}
@@ -2086,9 +2191,13 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
+ if (rq != this_rq() || p == current) {
+ printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p,"
+ " this_rq = %p, p = %p, current = %p\n",
+ __func__, task_pid_nr(p), p->comm, rq,
+ this_rq(), p, current);
return;
+ }
lockdep_assert_held(&rq->lock);
@@ -2112,17 +2221,19 @@ static void try_to_wake_up_local(struct task_struct *p)
trace_sched_waking(p);
if (!task_on_rq_queued(p)) {
- u64 wallclock = walt_ktime_clock();
+ u64 wallclock = sched_ktime_clock();
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ note_task_waking(p, wallclock);
}
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
+ /* TODO: send cpufreq notifier */
}
/**
@@ -2143,6 +2254,26 @@ int wake_up_process(struct task_struct *p)
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_process_no_notif - Wake up a specific process without notifying
+ * governor
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process_no_notif(struct task_struct *p)
+{
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1);
+}
+EXPORT_SYMBOL(wake_up_process_no_notif);
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0, 1);
@@ -2167,6 +2298,44 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_yielded = 0;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field
+ *
+ * Stop accounting the (exiting) task's future CPU usage.
+ *
+ * We need this so that reset_all_window_stats() can function correctly.
+ * reset_all_window_stats() depends on the do_each_thread/for_each_thread
+ * task iterators to reset *all* tasks' statistics. Exiting tasks, however,
+ * become invisible to those iterators. sched_exit() is called on an exiting
+ * task prior to its removal from the task list, which lets reset_all_window_stats()
+ * function correctly.
+ */
+void sched_exit(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 wallclock;
+
+ sched_set_group_id(p, 0);
+
+ rq = task_rq_lock(p, &flags);
+
+ /* rq->curr == p */
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ dequeue_task(rq, p, 0);
+ reset_task_stats(p);
+ p->ravg.mark_start = wallclock;
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+
+ enqueue_task(rq, p, 0);
+ clear_ed_task(p, rq);
+ task_rq_unlock(rq, p, &flags);
+ free_task_load_ptrs(p);
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
@@ -2188,7 +2357,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
INIT_LIST_HEAD(&p->se.group_node);
- walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2204,6 +2372,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_rt_schedtune_timer(&p->rt);
INIT_LIST_HEAD(&p->rt.run_list);
+ p->rt.timeout = 0;
+ p->rt.time_slice = sched_rr_timeslice;
+ p->rt.on_rq = 0;
+ p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2273,7 +2445,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
- int cpu = get_cpu();
+ int cpu;
+
+ init_new_task_load(p, false);
+ cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
@@ -2465,11 +2640,10 @@ void wake_up_new_task(struct task_struct *p)
unsigned long flags;
struct rq *rq;
+ add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, flags);
p->state = TASK_RUNNING;
- walt_init_new_task_load(p);
-
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2484,10 +2658,9 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
#endif
rq = __task_rq_lock(p);
+ mark_task_starting(p);
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
-
- walt_mark_task_starting(p);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2615,6 +2788,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+
+#ifdef CONFIG_MSM_APP_SETTINGS
+ if (use_app_setting)
+ switch_app_setting_bit(prev, next);
+
+ if (use_32bit_app_setting || use_32bit_app_setting_pro)
+ switch_32bit_app_setting_bit(prev, next);
+#endif
}
/**
@@ -2906,7 +3087,7 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
*load = rq->load.weight;
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP)
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
@@ -2916,18 +3097,23 @@ void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
- int dest_cpu;
+ int dest_cpu, curr_cpu;
+
+#ifdef CONFIG_SCHED_HMP
+ return;
+#endif
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ curr_cpu = task_cpu(p);
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
if (dest_cpu == smp_processor_id())
goto unlock;
- if (likely(cpu_active(dest_cpu))) {
+ if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
struct migration_arg arg = { p, dest_cpu };
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+ stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
return;
}
unlock:
@@ -2994,19 +3180,29 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ u64 wallclock;
+ bool early_notif;
+ u32 old_load;
+ struct related_thread_group *grp;
sched_clock_tick();
raw_spin_lock(&rq->lock);
- walt_set_window_start(rq);
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- walt_ktime_clock(), 0);
+ old_load = task_load(curr);
+ set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
calc_global_load_tick(rq);
+ wallclock = sched_ktime_clock();
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ early_notif = early_detection_notify(rq, wallclock);
raw_spin_unlock(&rq->lock);
+ if (early_notif)
+ atomic_notifier_call_chain(&load_alert_notifier_head,
+ 0, (void *)(long)cpu);
+
perf_event_task_tick();
#ifdef CONFIG_SMP
@@ -3015,8 +3211,17 @@ void scheduler_tick(void)
#endif
rq_last_tick_reset(rq);
+ rcu_read_lock();
+ grp = task_related_thread_group(curr);
+ if (update_preferred_cluster(grp, curr, old_load))
+ set_preferred_cluster(grp);
+ rcu_read_unlock();
+
if (curr->sched_class == &fair_sched_class)
check_for_migration(rq, curr);
+
+ if (cpu == tick_do_timer_cpu)
+ core_ctl_check(wallclock);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -3059,9 +3264,24 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
+/*
+ * preemptoff stack tracing threshold in ns.
+ * default: 1ms
+ */
+unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;
+
+struct preempt_store {
+ u64 ts;
+ unsigned long caddr[4];
+ bool irqs_disabled;
+};
+
+static DEFINE_PER_CPU(struct preempt_store, the_ps);
void preempt_count_add(int val)
{
+ struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
+
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Underflow?
@@ -3082,6 +3302,13 @@ void preempt_count_add(int val)
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
+ ps->ts = sched_clock();
+ ps->caddr[0] = CALLER_ADDR0;
+ ps->caddr[1] = CALLER_ADDR1;
+ ps->caddr[2] = CALLER_ADDR2;
+ ps->caddr[3] = CALLER_ADDR3;
+ ps->irqs_disabled = irqs_disabled();
+
trace_preempt_off(CALLER_ADDR0, ip);
}
}
@@ -3104,8 +3331,22 @@ void preempt_count_sub(int val)
return;
#endif
- if (preempt_count() == val)
+ if (preempt_count() == val) {
+ struct preempt_store *ps = &per_cpu(the_ps,
+ raw_smp_processor_id());
+ u64 delta = sched_clock() - ps->ts;
+
+ /*
+ * Trace preempt disable stack if preemption
+ * is disabled for more than the threshold.
+ */
+ if (delta > sysctl_preemptoff_tracing_threshold_ns)
+ trace_sched_preempt_disable(delta, ps->irqs_disabled,
+ ps->caddr[0], ps->caddr[1],
+ ps->caddr[2], ps->caddr[3]);
+
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
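An illustrative aside (not part of the patch) on how the per-CPU snapshot taken in preempt_count_add() pairs with the threshold check above; the numbers are made up for the example.

/*
 * Illustrative only: with the default sysctl_preemptoff_tracing_threshold_ns
 * of 1000000, a section that keeps preemption disabled for 2.5 ms makes the
 * final preempt_count_sub() emit trace_sched_preempt_disable() with
 * delta == 2500000, the irqs_disabled state and the four caller addresses
 * recorded when preemption was first disabled; shorter sections only emit
 * the usual trace_preempt_off()/trace_preempt_on() pair.
 */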
@@ -3118,6 +3359,9 @@ NOKPROBE_SYMBOL(preempt_count_sub);
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
if (oops_in_progress)
return;
@@ -3128,12 +3372,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -3248,7 +3494,6 @@ static void __sched notrace __schedule(bool preempt)
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
@@ -3267,13 +3512,16 @@ static void __sched notrace __schedule(bool preempt)
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3306,14 +3554,19 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
- wallclock = walt_ktime_clock();
- walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
- walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
+ BUG_ON(task_cpu(next) != cpu_of(rq));
+
+ wallclock = sched_ktime_clock();
if (likely(prev != next)) {
+ update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
+ if (!is_idle_task(prev) && !prev->on_rq)
+ update_avg_burst(prev);
+
#ifdef CONFIG_SCHED_WALT
if (!prev->on_rq)
prev->last_sleep_ts = wallclock;
@@ -3322,10 +3575,13 @@ static void __sched notrace __schedule(bool preempt)
rq->curr = next;
++*switch_count;
+ set_task_last_switch_out(prev, wallclock);
+
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else {
+ update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3510,7 +3766,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+ int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3539,11 +3795,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
+
+ if (oldprio == prio)
+ queue_flag &= ~DEQUEUE_MOVE;
+
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
@@ -3561,7 +3821,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag |= ENQUEUE_REPLENISH;
+ queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3569,7 +3829,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag |= ENQUEUE_HEAD;
+ queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3584,7 +3844,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, enqueue_flag);
+ enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3943,6 +4203,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -4126,17 +4387,14 @@ change:
* itself.
*/
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
- }
+ if (new_effective_prio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
@@ -4146,15 +4404,14 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
- int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- if (oldprio <= p->prio)
- enqueue_flags |= ENQUEUE_HEAD;
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
- enqueue_task(rq, p, enqueue_flags);
+ enqueue_task(rq, p, queue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -4232,7 +4489,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+EXPORT_SYMBOL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4552,6 +4809,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
int retval;
+ int dest_cpu;
+ cpumask_t allowed_mask;
rcu_read_lock();
@@ -4613,20 +4872,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
-
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
+ cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+ dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
+ if (dest_cpu < nr_cpu_ids) {
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
}
+ } else {
+ retval = -EINVAL;
}
+
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
@@ -4690,6 +4955,15 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+
+ /*
+ * The userspace tasks are forbidden to run on
+	 * Userspace tasks are not allowed to run on isolated CPUs,
+	 * so exclude isolated CPUs from the mask returned by
+	 * getaffinity.
+ if (!(p->flags & PF_KTHREAD))
+ cpumask_andnot(mask, mask, cpu_isolated_mask);
+
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5111,6 +5385,8 @@ void show_state_filter(unsigned long state_filter)
sched_show_task(p);
}
+ touch_all_softlockup_watchdogs();
+
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
@@ -5131,15 +5407,19 @@ void init_idle_bootup_task(struct task_struct *idle)
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
* @cpu: cpu the idle task belongs to
+ * @cpu_up: differentiate between initial boot and hotplug
*
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu, bool cpu_up)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ if (!cpu_up)
+ init_new_task_load(idle, true);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
@@ -5148,6 +5428,8 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ kasan_unpoison_task_stack(idle);
+
#ifdef CONFIG_SMP
/*
 * It's possible that init_idle() gets called multiple times on a task,
@@ -5370,18 +5652,54 @@ static struct task_struct fake_task = {
};
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Remove a task from the runqueue and pretend that it's migrating. This
+ * should prevent migrations for the detached task and disallow further
+ * changes to tsk_cpus_allowed.
+ */
+static void
+detach_one_task(struct task_struct *p, struct rq *rq, struct list_head *tasks)
+{
+ lockdep_assert_held(&rq->lock);
+
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(rq, p, 0);
+ list_add(&p->se.group_node, tasks);
+}
+
+static void attach_tasks(struct list_head *tasks, struct rq *rq)
+{
+ struct task_struct *p;
+
+ lockdep_assert_held(&rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ BUG_ON(task_rq(p) != rq);
+ activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ }
+}
+
+/*
+ * Migrate all tasks from the rq (skipping pinned kthreads unless
+ * migrate_pinned_tasks says so); sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
*
 * Called with rq->lock held even though we're in stop_machine() and
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
+ unsigned int num_pinned_kthreads = 1; /* this thread */
+ LIST_HEAD(tasks);
+ cpumask_t avail_cpus;
+
+ cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
/*
* Fudge the rq selection such that the below task selection loop
@@ -5417,6 +5735,14 @@ static void migrate_tasks(struct rq *dead_rq)
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
+ if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
+ !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
+ detach_one_task(next, rq, &tasks);
+ num_pinned_kthreads += 1;
+ lockdep_unpin_lock(&rq->lock);
+ continue;
+ }
+
/*
* Rules for changing task_struct::cpus_allowed are holding
* both pi_lock and rq->lock, such that holding either
@@ -5435,26 +5761,271 @@ static void migrate_tasks(struct rq *dead_rq)
* Since we're inside stop-machine, _nothing_ should have
* changed the task, WARN if weird stuff happened, because in
* that case the above rq->lock drop is a fail too.
+		 * However, during CPU isolation the load balancer might have
+		 * interfered since we don't stop all CPUs. Ignore the warning
+		 * for this case.
*/
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
+ WARN_ON(migrate_pinned_tasks);
raw_spin_unlock(&next->pi_lock);
continue;
}
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
+ raw_spin_unlock(&next->pi_lock);
raw_spin_unlock(&rq->lock);
+ notify_migration(dead_rq->cpu, dest_cpu, true, next);
rq = dead_rq;
+ raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
}
raw_spin_unlock(&next->pi_lock);
}
rq->stop = stop;
+
+ if (num_pinned_kthreads > 1)
+ attach_tasks(&tasks, rq);
+}
+
+static void set_rq_online(struct rq *rq);
+static void set_rq_offline(struct rq *rq);
+
+int do_isolation_work_cpu_stop(void *data)
+{
+ unsigned int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ watchdog_disable(cpu);
+
+ irq_migrate_all_off_this_cpu();
+
+ local_irq_disable();
+
+ sched_ttwu_pending();
+
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Temporarily mark the rq as offline. This will allow us to
+ * move tasks off the CPU.
+ */
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+
+ migrate_tasks(rq, false);
+
+ if (rq->rd)
+ set_rq_online(rq);
+ raw_spin_unlock(&rq->lock);
+
+ /*
+	 * We might have been in tickless state. Clear the NOHZ flags to
+	 * avoid being kicked to help out with balancing.
+ */
+ nohz_balance_clear_nohz_mask(cpu);
+
+ clear_hmp_request(cpu);
+ local_irq_enable();
+ return 0;
+}
+
+int do_unisolation_work_cpu_stop(void *data)
+{
+ watchdog_enable(smp_processor_id());
+ return 0;
+}
+
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
+
+static void sched_update_group_capacities(int cpu)
+{
+ struct sched_domain *sd;
+
+ mutex_lock(&sched_domains_mutex);
+ rcu_read_lock();
+
+ for_each_domain(cpu, sd) {
+ int balance_cpu = group_balance_cpu(sd->groups);
+
+ init_sched_groups_capacity(cpu, sd);
+ /*
+		 * Need to ensure this is also called for the
+		 * group's balance CPU.
+ */
+ if (cpu != balance_cpu)
+ init_sched_groups_capacity(balance_cpu, sd);
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&sched_domains_mutex);
+}
+
+static unsigned int cpu_isolation_vote[NR_CPUS];
+
+int sched_isolate_count(const cpumask_t *mask, bool include_offline)
+{
+ cpumask_t count_mask = CPU_MASK_NONE;
+
+ if (include_offline) {
+ cpumask_complement(&count_mask, cpu_online_mask);
+ cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
+ cpumask_and(&count_mask, &count_mask, mask);
+ } else {
+ cpumask_and(&count_mask, mask, cpu_isolated_mask);
+ }
+
+ return cpumask_weight(&count_mask);
}
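A hedged worked example (not part of the patch) of the include_offline semantics above; the CPU numbers are made up.

/*
 * Illustrative only: for a mask covering CPUs 0-3 with CPU2 isolated and
 * CPU3 offline, sched_isolate_count(mask, false) returns 1 (only the
 * isolated CPU), while sched_isolate_count(mask, true) returns 2 (the
 * isolated CPU plus the offline one).
 */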
+
+/*
+ * 1) CPU is isolated and the CPU is offlined:
+ *	Unisolate the core.
+ * 2) CPU is not isolated and the CPU is offlined:
+ *	No action taken.
+ * 3) CPU is offline and there is a request to isolate:
+ *	Request ignored.
+ * 4) CPU is offline and isolated:
+ *	Not a possible state.
+ * 5) CPU is online and there is a request to isolate:
+ *	Normal case: isolate the CPU.
+ * 6) CPU is not isolated and comes back online:
+ *	Nothing to do.
+ *
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_isolate_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ cpumask_t avail_cpus;
+ int ret_code = 0;
+ u64 start_time = 0;
+
+ if (trace_sched_isolate_enabled())
+ start_time = sched_clock();
+
+ cpu_maps_update_begin();
+
+ cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
+
+ /* We cannot isolate ALL cpus in the system */
+ if (cpumask_weight(&avail_cpus) == 1) {
+ ret_code = -EINVAL;
+ goto out;
+ }
+
+ if (!cpu_online(cpu)) {
+ ret_code = -EINVAL;
+ goto out;
+ }
+
+ if (++cpu_isolation_vote[cpu] > 1)
+ goto out;
+
+ /*
+ * There is a race between watchdog being enabled by hotplug and
+	 * core isolation disabling the watchdog. When a CPU is hotplugged
+	 * in and the hotplug lock has been released, the watchdog thread
+	 * might not yet have run to enable the watchdog.
+ * We have to wait for the watchdog to be enabled before proceeding.
+ */
+ if (!watchdog_configured(cpu)) {
+ msleep(20);
+ if (!watchdog_configured(cpu)) {
+ --cpu_isolation_vote[cpu];
+ ret_code = -EBUSY;
+ goto out;
+ }
+ }
+
+ set_cpu_isolated(cpu, true);
+ cpumask_clear_cpu(cpu, &avail_cpus);
+
+ /* Migrate timers */
+ smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
+ smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
+
+ stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+
+ calc_load_migrate(rq);
+ update_max_interval();
+ sched_update_group_capacities(cpu);
+
+out:
+ cpu_maps_update_done();
+ trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+ start_time, 1);
+ return ret_code;
+}
+
+/*
+ * Note: The client calling sched_isolate_cpu() is responsible for ONLY
+ * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
+ * Client is also responsible for unisolating when a core goes offline
+ * (after CPU is marked offline).
+ */
+int sched_unisolate_cpu_unlocked(int cpu)
+{
+ int ret_code = 0;
+ struct rq *rq = cpu_rq(cpu);
+ u64 start_time = 0;
+
+ if (trace_sched_isolate_enabled())
+ start_time = sched_clock();
+
+ if (!cpu_isolation_vote[cpu]) {
+ ret_code = -EINVAL;
+ goto out;
+ }
+
+ if (--cpu_isolation_vote[cpu])
+ goto out;
+
+ if (cpu_online(cpu)) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ rq->age_stamp = sched_clock_cpu(cpu);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ set_cpu_isolated(cpu, false);
+ update_max_interval();
+ sched_update_group_capacities(cpu);
+
+ if (cpu_online(cpu)) {
+ stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
+
+ /* Kick CPU to immediately do load balancing */
+ if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ smp_send_reschedule(cpu);
+ }
+
+out:
+ trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
+ start_time, 0);
+ return ret_code;
+}
+
+int sched_unisolate_cpu(int cpu)
+{
+ int ret_code;
+
+ cpu_maps_update_begin();
+ ret_code = sched_unisolate_cpu_unlocked(cpu);
+ cpu_maps_update_done();
+ return ret_code;
+}
+
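A minimal usage sketch (illustrative, not part of the patch) of the refcounted isolation API above; my_throttle_cpu() is a hypothetical caller, not anything defined by this patch.

/*
 * Illustrative only: isolation votes are reference counted per CPU, so a
 * hypothetical client must balance every sched_isolate_cpu() call with a
 * matching sched_unisolate_cpu() before the CPU becomes available again.
 */
static int my_throttle_cpu(int cpu, bool throttle)
{
	return throttle ? sched_isolate_cpu(cpu) : sched_unisolate_cpu(cpu);
}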
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5742,7 +6313,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE:
raw_spin_lock_irqsave(&rq->lock, flags);
- walt_set_window_start(rq);
+ set_window_start(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
break;
@@ -5763,17 +6334,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
- walt_migrate_sync_cpu(cpu);
+
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(rq);
+ migrate_tasks(rq, true);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
case CPU_DEAD:
+ clear_hmp_request(cpu);
calc_load_migrate(rq);
break;
#endif
@@ -6294,6 +6866,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
+ unsigned long next_balance = rq->next_balance;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -6325,6 +6898,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL;
}
+ for (tmp = sd; tmp; ) {
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(tmp->balance_interval);
+ if (time_after(next_balance, tmp->last_balance + interval))
+ next_balance = tmp->last_balance + interval;
+
+ tmp = tmp->parent;
+ }
+ rq->next_balance = next_balance;
+
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
@@ -6574,11 +7158,14 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ cpumask_t avail_mask;
WARN_ON(!sg);
do {
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ cpumask_andnot(&avail_mask, sched_group_cpus(sg),
+ cpu_isolated_mask);
+ sg->group_weight = cpumask_weight(&avail_mask);
sg = sg->next;
} while (sg != sd->groups);
@@ -7311,6 +7898,9 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
+#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -7375,7 +7965,8 @@ static int build_sched_domains(const struct cpumask *cpu_map,
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
- init_sched_energy(i, sd, tl->energy);
+ if (energy_aware())
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -7697,6 +8288,8 @@ void __init sched_init_smp(void)
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+ update_cluster_topology();
+
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -7715,6 +8308,7 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
+
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -7738,6 +8332,15 @@ void __init sched_init(void)
int i, j;
unsigned long alloc_size = 0, ptr;
+#ifdef CONFIG_SCHED_HMP
+ pr_info("HMP scheduling enabled.\n");
+#endif
+
+ BUG_ON(num_possible_cpus() > BITS_PER_LONG);
+
+ sched_boost_parse_dt();
+ init_clusters();
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
@@ -7854,12 +8457,53 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->max_idle_balance_cost = sysctl_sched_migration_cost;
-#ifdef CONFIG_SCHED_WALT
+#ifdef CONFIG_SCHED_HMP
+ cpumask_set_cpu(i, &rq->freq_domain_cpumask);
+ rq->hmp_stats.cumulative_runnable_avg = 0;
+ rq->window_start = 0;
+ rq->hmp_stats.nr_big_tasks = 0;
+ rq->hmp_flags = 0;
rq->cur_irqload = 0;
rq->avg_irqload = 0;
rq->irqload_ts = 0;
+ rq->static_cpu_pwr_cost = 0;
+ rq->cc.cycles = 1;
+ rq->cc.time = 1;
+ rq->cstate = 0;
+ rq->wakeup_latency = 0;
+ rq->wakeup_energy = 0;
+
+ /*
+		 * All CPUs are part of the same cluster by default. This
+		 * avoids the need to check for rq->cluster being non-NULL
+		 * in hot paths like select_best_cpu().
+ */
+ rq->cluster = &init_cluster;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
+ rq->old_busy_time = 0;
+ rq->old_estimated_time = 0;
+ rq->old_busy_time_group = 0;
+ rq->hmp_stats.pred_demands_sum = 0;
+ rq->curr_table = 0;
+ rq->prev_top = 0;
+ rq->curr_top = 0;
+
+ for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
+ memset(&rq->load_subs[j], 0,
+ sizeof(struct load_subtractions));
+
+ rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
+ sizeof(u8), GFP_NOWAIT);
+
+ /* No other choice */
+ BUG_ON(!rq->top_tasks[j]);
+
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]);
+ }
#endif
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7875,6 +8519,11 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
+ i = alloc_related_thread_groups();
+ BUG_ON(i);
+
+ set_hmp_defaults();
+
set_load_weight(&init_task);
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7898,7 +8547,7 @@ void __init sched_init(void)
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
- init_idle(current, smp_processor_id());
+ init_idle(current, smp_processor_id(), false);
calc_load_update = jiffies + LOAD_FREQ;
@@ -7952,6 +8601,7 @@ EXPORT_SYMBOL(__might_sleep);
void ___might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
+ unsigned long preempt_disable_ip;
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7964,6 +8614,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
return;
prev_jiffy = jiffies;
+ /* Save this before calling printk(), since that will clobber it */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
printk(KERN_ERR
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
@@ -7978,12 +8631,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (!preempt_count_equals(preempt_offset)) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
+#ifdef CONFIG_PANIC_ON_SCHED_BUG
+ BUG();
#endif
dump_stack();
}
@@ -8195,7 +8850,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -8204,7 +8859,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags);
}
@@ -8585,7 +9240,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
#ifdef CONFIG_CGROUP_SCHED
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct task_group, css) : NULL;
}
@@ -8982,6 +9637,13 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
+#ifdef CONFIG_SCHED_HMP
+ {
+ .name = "upmigrate_discourage",
+ .read_u64 = cpu_upmigrate_discourage_read_u64,
+ .write_u64 = cpu_upmigrate_discourage_write_u64,
+ },
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
new file mode 100644
index 000000000000..ce15ae7fe76b
--- /dev/null
+++ b/kernel/sched/core_ctl.c
@@ -0,0 +1,1169 @@
+/* Copyright (c) 2014-2017, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "core_ctl: " fmt
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include <trace/events/sched.h>
+
+#define MAX_CPUS_PER_CLUSTER 4
+#define MAX_CLUSTERS 2
+
+struct cluster_data {
+ bool inited;
+ unsigned int min_cpus;
+ unsigned int max_cpus;
+ unsigned int offline_delay_ms;
+ unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER];
+ unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER];
+ unsigned int active_cpus;
+ unsigned int num_cpus;
+ unsigned int nr_isolated_cpus;
+ cpumask_t cpu_mask;
+ unsigned int need_cpus;
+ unsigned int task_thres;
+ unsigned int max_nr;
+ s64 need_ts;
+ struct list_head lru;
+ bool pending;
+ spinlock_t pending_lock;
+ bool is_big_cluster;
+ bool enable;
+ int nrrun;
+ bool nrrun_changed;
+ struct task_struct *core_ctl_thread;
+ unsigned int first_cpu;
+ unsigned int boost;
+ struct kobject kobj;
+};
+
+struct cpu_data {
+ bool is_busy;
+ unsigned int busy;
+ unsigned int cpu;
+ bool not_preferred;
+ struct cluster_data *cluster;
+ struct list_head sib;
+ bool isolated_by_us;
+ unsigned int max_nr;
+};
+
+static DEFINE_PER_CPU(struct cpu_data, cpu_state);
+static struct cluster_data cluster_state[MAX_CLUSTERS];
+static unsigned int num_clusters;
+
+#define for_each_cluster(cluster, idx) \
+ for ((cluster) = &cluster_state[idx]; (idx) < num_clusters;\
+ (idx)++, (cluster) = &cluster_state[idx])
+
+static DEFINE_SPINLOCK(state_lock);
+static void apply_need(struct cluster_data *state);
+static void wake_up_core_ctl_thread(struct cluster_data *state);
+static bool initialized;
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster);
+
+/* ========================= sysfs interface =========================== */
+
+static ssize_t store_min_cpus(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ state->min_cpus = min(val, state->max_cpus);
+ wake_up_core_ctl_thread(state);
+
+ return count;
+}
+
+static ssize_t show_min_cpus(const struct cluster_data *state, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus);
+}
+
+static ssize_t store_max_cpus(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ val = min(val, state->num_cpus);
+ state->max_cpus = val;
+ state->min_cpus = min(state->min_cpus, state->max_cpus);
+ wake_up_core_ctl_thread(state);
+
+ return count;
+}
+
+static ssize_t show_max_cpus(const struct cluster_data *state, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus);
+}
+
+static ssize_t store_offline_delay_ms(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ state->offline_delay_ms = val;
+ apply_need(state);
+
+ return count;
+}
+
+static ssize_t show_task_thres(const struct cluster_data *state, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres);
+}
+
+static ssize_t store_task_thres(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ if (val < state->num_cpus)
+ return -EINVAL;
+
+ state->task_thres = val;
+ apply_need(state);
+
+ return count;
+}
+
+static ssize_t show_offline_delay_ms(const struct cluster_data *state,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms);
+}
+
+static ssize_t store_busy_up_thres(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val[MAX_CPUS_PER_CLUSTER];
+ int ret, i;
+
+ ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+ if (ret != 1 && ret != state->num_cpus)
+ return -EINVAL;
+
+ if (ret == 1) {
+ for (i = 0; i < state->num_cpus; i++)
+ state->busy_up_thres[i] = val[0];
+ } else {
+ for (i = 0; i < state->num_cpus; i++)
+ state->busy_up_thres[i] = val[i];
+ }
+ apply_need(state);
+ return count;
+}
+
+static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf)
+{
+ int i, count = 0;
+
+ for (i = 0; i < state->num_cpus; i++)
+ count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
+ state->busy_up_thres[i]);
+
+ count += snprintf(buf + count, PAGE_SIZE - count, "\n");
+ return count;
+}
+
+static ssize_t store_busy_down_thres(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val[MAX_CPUS_PER_CLUSTER];
+ int ret, i;
+
+ ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+ if (ret != 1 && ret != state->num_cpus)
+ return -EINVAL;
+
+ if (ret == 1) {
+ for (i = 0; i < state->num_cpus; i++)
+ state->busy_down_thres[i] = val[0];
+ } else {
+ for (i = 0; i < state->num_cpus; i++)
+ state->busy_down_thres[i] = val[i];
+ }
+ apply_need(state);
+ return count;
+}
+
+static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf)
+{
+ int i, count = 0;
+
+ for (i = 0; i < state->num_cpus; i++)
+ count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
+ state->busy_down_thres[i]);
+
+ count += snprintf(buf + count, PAGE_SIZE - count, "\n");
+ return count;
+}
+
+static ssize_t store_is_big_cluster(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ state->is_big_cluster = val ? 1 : 0;
+ return count;
+}
+
+static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster);
+}
+
+static ssize_t store_enable(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+ bool bval;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ bval = !!val;
+ if (bval != state->enable) {
+ state->enable = bval;
+ apply_need(state);
+ }
+
+ return count;
+}
+
+static ssize_t show_enable(const struct cluster_data *state, char *buf)
+{
+ return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable);
+}
+
+static ssize_t show_need_cpus(const struct cluster_data *state, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus);
+}
+
+static ssize_t show_active_cpus(const struct cluster_data *state, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus);
+}
+
+static ssize_t show_global_state(const struct cluster_data *state, char *buf)
+{
+ struct cpu_data *c;
+ struct cluster_data *cluster;
+ ssize_t count = 0;
+ unsigned int cpu;
+
+ spin_lock_irq(&state_lock);
+ for_each_possible_cpu(cpu) {
+ c = &per_cpu(cpu_state, cpu);
+ cluster = c->cluster;
+ if (!cluster || !cluster->inited)
+ continue;
+
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "CPU%u\n", cpu);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tCPU: %u\n", c->cpu);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tOnline: %u\n",
+ cpu_online(c->cpu));
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tIsolated: %u\n",
+ cpu_isolated(c->cpu));
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tFirst CPU: %u\n",
+ cluster->first_cpu);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tBusy%%: %u\n", c->busy);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tIs busy: %u\n", c->is_busy);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tNot preferred: %u\n",
+ c->not_preferred);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tNr running: %u\n", cluster->nrrun);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tActive CPUs: %u\n", get_active_cpu_count(cluster));
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tNeed CPUs: %u\n", cluster->need_cpus);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tNr isolated CPUs: %u\n",
+ cluster->nr_isolated_cpus);
+ count += snprintf(buf + count, PAGE_SIZE - count,
+ "\tBoost: %u\n", (unsigned int) cluster->boost);
+ }
+ spin_unlock_irq(&state_lock);
+
+ return count;
+}
+
+static ssize_t store_not_preferred(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ struct cpu_data *c;
+ unsigned int i;
+ unsigned int val[MAX_CPUS_PER_CLUSTER];
+ unsigned long flags;
+ int ret;
+
+ ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+ if (ret != state->num_cpus)
+ return -EINVAL;
+
+ spin_lock_irqsave(&state_lock, flags);
+ for (i = 0; i < state->num_cpus; i++) {
+ c = &per_cpu(cpu_state, i + state->first_cpu);
+ c->not_preferred = val[i];
+ }
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return count;
+}
+
+static ssize_t show_not_preferred(const struct cluster_data *state, char *buf)
+{
+ struct cpu_data *c;
+ ssize_t count = 0;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&state_lock, flags);
+ for (i = 0; i < state->num_cpus; i++) {
+ c = &per_cpu(cpu_state, i + state->first_cpu);
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "CPU#%d: %u\n", c->cpu, c->not_preferred);
+ }
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return count;
+}
+
+
+struct core_ctl_attr {
+ struct attribute attr;
+ ssize_t (*show)(const struct cluster_data *, char *);
+ ssize_t (*store)(struct cluster_data *, const char *, size_t count);
+};
+
+#define core_ctl_attr_ro(_name) \
+static struct core_ctl_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name) \
+static struct core_ctl_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(active_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+core_ctl_attr_rw(enable);
+
+static struct attribute *default_attrs[] = {
+ &min_cpus.attr,
+ &max_cpus.attr,
+ &offline_delay_ms.attr,
+ &busy_up_thres.attr,
+ &busy_down_thres.attr,
+ &task_thres.attr,
+ &is_big_cluster.attr,
+ &enable.attr,
+ &need_cpus.attr,
+ &active_cpus.attr,
+ &global_state.attr,
+ &not_preferred.attr,
+ NULL
+};
+
+#define to_cluster_data(k) container_of(k, struct cluster_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct cluster_data *data = to_cluster_data(kobj);
+ struct core_ctl_attr *cattr = to_attr(attr);
+ ssize_t ret = -EIO;
+
+ if (cattr->show)
+ ret = cattr->show(data, buf);
+
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cluster_data *data = to_cluster_data(kobj);
+ struct core_ctl_attr *cattr = to_attr(attr);
+ ssize_t ret = -EIO;
+
+ if (cattr->store)
+ ret = cattr->store(data, buf, count);
+
+ return ret;
+}
+
+static const struct sysfs_ops sysfs_ops = {
+ .show = show,
+ .store = store,
+};
+
+static struct kobj_type ktype_core_ctl = {
+ .sysfs_ops = &sysfs_ops,
+ .default_attrs = default_attrs,
+};
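A hedged note (not part of the patch) on where these attributes are expected to appear; the path follows from the kobject_add() against the CPU device in cluster_init() further below, and the example values are arbitrary.

/*
 * Illustrative only: for a cluster whose first CPU is 4, the attributes
 * above are expected to show up under
 * /sys/devices/system/cpu/cpu4/core_ctl/, e.g. busy_up_thres accepts
 * either a single value applied to every CPU ("60") or one value per
 * CPU in the cluster ("60 70 80 90").
 */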
+
+/* ==================== runqueue based core count =================== */
+
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+static s64 rq_avg_timestamp_ms;
+
+static void update_running_avg(bool trigger_update)
+{
+ int avg, iowait_avg, big_avg, old_nrrun;
+ int old_max_nr, max_nr, big_max_nr;
+ s64 now;
+ unsigned long flags;
+ struct cluster_data *cluster;
+ unsigned int index = 0;
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ now = ktime_to_ms(ktime_get());
+ if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+ spin_unlock_irqrestore(&state_lock, flags);
+ return;
+ }
+ rq_avg_timestamp_ms = now;
+ sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg,
+ &max_nr, &big_max_nr);
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ for_each_cluster(cluster, index) {
+ if (!cluster->inited)
+ continue;
+
+ old_nrrun = cluster->nrrun;
+ old_max_nr = cluster->max_nr;
+ cluster->nrrun = cluster->is_big_cluster ? big_avg : avg;
+ cluster->max_nr = cluster->is_big_cluster ? big_max_nr : max_nr;
+
+ if (cluster->nrrun != old_nrrun ||
+ cluster->max_nr != old_max_nr) {
+
+ if (trigger_update)
+ apply_need(cluster);
+ else
+ cluster->nrrun_changed = true;
+ }
+ }
+ return;
+}
+
+#define MAX_NR_THRESHOLD 4
+/* adjust needed CPUs based on current runqueue information */
+static unsigned int apply_task_need(const struct cluster_data *cluster,
+ unsigned int new_need)
+{
+ /* unisolate all cores if there are enough tasks */
+ if (cluster->nrrun >= cluster->task_thres)
+ return cluster->num_cpus;
+
+ /* only unisolate more cores if there are tasks to run */
+ if (cluster->nrrun > new_need)
+ new_need = new_need + 1;
+
+ /*
+ * We don't want tasks to be overcrowded in a cluster.
+	 * If any CPU has more than MAX_NR_THRESHOLD runnable tasks in the
+	 * last window, bring in another CPU to help out.
+ */
+ if (cluster->max_nr > MAX_NR_THRESHOLD)
+ new_need = new_need + 1;
+
+ return new_need;
+}
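A worked example (illustrative, not part of the patch) of the adjustment above, using made-up numbers.

/*
 * Illustrative only: with new_need == 2, nrrun == 3, task_thres == UINT_MAX
 * and max_nr == 5, the nrrun > new_need check bumps the need to 3 and the
 * max_nr > MAX_NR_THRESHOLD check bumps it again to 4; apply_limits() in
 * eval_need() then clamps the result to the cluster's min_cpus/max_cpus
 * range.
 */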
+
+/* ======================= load based core count ====================== */
+
+static unsigned int apply_limits(const struct cluster_data *cluster,
+ unsigned int need_cpus)
+{
+ return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus);
+}
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster)
+{
+ return cluster->num_cpus -
+ sched_isolate_count(&cluster->cpu_mask, true);
+}
+
+static bool is_active(const struct cpu_data *state)
+{
+ return cpu_online(state->cpu) && !cpu_isolated(state->cpu);
+}
+
+static bool adjustment_possible(const struct cluster_data *cluster,
+ unsigned int need)
+{
+ return (need < cluster->active_cpus || (need > cluster->active_cpus &&
+ cluster->nr_isolated_cpus));
+}
+
+static bool eval_need(struct cluster_data *cluster)
+{
+ unsigned long flags;
+ struct cpu_data *c;
+ unsigned int need_cpus = 0, last_need, thres_idx;
+ int ret = 0;
+ bool need_flag = false;
+ unsigned int new_need;
+ s64 now, elapsed;
+
+ if (unlikely(!cluster->inited))
+ return 0;
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (cluster->boost || !cluster->enable) {
+ need_cpus = cluster->max_cpus;
+ } else {
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0;
+ list_for_each_entry(c, &cluster->lru, sib) {
+ if (c->busy >= cluster->busy_up_thres[thres_idx])
+ c->is_busy = true;
+ else if (c->busy < cluster->busy_down_thres[thres_idx])
+ c->is_busy = false;
+ need_cpus += c->is_busy;
+ }
+ need_cpus = apply_task_need(cluster, need_cpus);
+ }
+ new_need = apply_limits(cluster, need_cpus);
+ need_flag = adjustment_possible(cluster, new_need);
+
+ last_need = cluster->need_cpus;
+ now = ktime_to_ms(ktime_get());
+
+ if (new_need > cluster->active_cpus) {
+ ret = 1;
+ } else {
+ if (new_need == last_need) {
+ cluster->need_ts = now;
+ spin_unlock_irqrestore(&state_lock, flags);
+ return 0;
+ }
+
+ elapsed = now - cluster->need_ts;
+ ret = elapsed >= cluster->offline_delay_ms;
+ }
+
+ if (ret) {
+ cluster->need_ts = now;
+ cluster->need_cpus = new_need;
+ }
+ trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need,
+ ret && need_flag);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return ret && need_flag;
+}
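An illustrative timeline (not part of the patch) of the hysteresis implemented above, assuming offline_delay_ms == 100.

/*
 * Illustrative only, assuming offline_delay_ms == 100: a rise in need above
 * active_cpus is applied immediately (ret == 1), while a drop is only acted
 * on once at least 100 ms have elapsed since need_ts was last refreshed;
 * until then eval_need() keeps returning 0 and no isolation is triggered.
 */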
+
+static void apply_need(struct cluster_data *cluster)
+{
+ if (eval_need(cluster))
+ wake_up_core_ctl_thread(cluster);
+}
+
+static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
+{
+ struct cpu_data *c = &per_cpu(cpu_state, cpu);
+ struct cluster_data *cluster = c->cluster;
+ unsigned int old_is_busy = c->is_busy;
+
+ if (!cluster || !cluster->inited)
+ return 0;
+
+ update_running_avg(false);
+ if (c->busy == busy && !cluster->nrrun_changed)
+ return 0;
+ c->busy = busy;
+ cluster->nrrun_changed = false;
+
+ apply_need(cluster);
+ trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
+ return 0;
+}
+
+/* ========================= core count enforcement ==================== */
+
+static void wake_up_core_ctl_thread(struct cluster_data *cluster)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ cluster->pending = true;
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+ wake_up_process_no_notif(cluster->core_ctl_thread);
+}
+
+static u64 core_ctl_check_timestamp;
+static u64 core_ctl_check_interval;
+
+static bool do_check(u64 wallclock)
+{
+ bool do_check = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ if ((wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval) {
+ core_ctl_check_timestamp = wallclock;
+ do_check = true;
+ }
+ spin_unlock_irqrestore(&state_lock, flags);
+ return do_check;
+}
+
+int core_ctl_set_boost(bool boost)
+{
+ unsigned int index = 0;
+ struct cluster_data *cluster;
+ unsigned long flags;
+ int ret = 0;
+ bool boost_state_changed = false;
+
+ if (unlikely(!initialized))
+ return 0;
+
+ spin_lock_irqsave(&state_lock, flags);
+ for_each_cluster(cluster, index) {
+ if (cluster->is_big_cluster) {
+ if (boost) {
+ boost_state_changed = !cluster->boost;
+ ++cluster->boost;
+ } else {
+ if (!cluster->boost) {
+ pr_err("Error turning off boost. Boost already turned off\n");
+ ret = -EINVAL;
+ } else {
+ --cluster->boost;
+ boost_state_changed = !cluster->boost;
+ }
+ }
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ if (boost_state_changed)
+ apply_need(cluster);
+
+ trace_core_ctl_set_boost(cluster->boost, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(core_ctl_set_boost);
+
+void core_ctl_check(u64 wallclock)
+{
+ if (unlikely(!initialized))
+ return;
+
+ if (do_check(wallclock)) {
+ unsigned int index = 0;
+ struct cluster_data *cluster;
+
+ update_running_avg(true);
+
+ for_each_cluster(cluster, index) {
+ if (eval_need(cluster))
+ wake_up_core_ctl_thread(cluster);
+ }
+ }
+}
+
+static void move_cpu_lru(struct cpu_data *cpu_data)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ list_del(&cpu_data->sib);
+ list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru);
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void try_to_isolate(struct cluster_data *cluster, unsigned int need)
+{
+ struct cpu_data *c, *tmp;
+ unsigned long flags;
+ unsigned int num_cpus = cluster->num_cpus;
+ unsigned int nr_isolated = 0;
+
+ /*
+ * Protect against entry being removed (and added at tail) by other
+ * thread (hotplug).
+ */
+ spin_lock_irqsave(&state_lock, flags);
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!num_cpus--)
+ break;
+
+ if (!is_active(c))
+ continue;
+ if (cluster->active_cpus == need)
+ break;
+ /* Don't offline busy CPUs. */
+ if (c->is_busy)
+ continue;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ pr_debug("Trying to isolate CPU%u\n", c->cpu);
+ if (!sched_isolate_cpu(c->cpu)) {
+ c->isolated_by_us = true;
+ move_cpu_lru(c);
+ nr_isolated++;
+ } else {
+ pr_debug("Unable to isolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ spin_lock_irqsave(&state_lock, flags);
+ }
+ cluster->nr_isolated_cpus += nr_isolated;
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ /*
+ * If the number of active CPUs is within the limits, then
+ * don't force isolation of any busy CPUs.
+ */
+ if (cluster->active_cpus <= cluster->max_cpus)
+ return;
+
+ nr_isolated = 0;
+ num_cpus = cluster->num_cpus;
+ spin_lock_irqsave(&state_lock, flags);
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!num_cpus--)
+ break;
+
+ if (!is_active(c))
+ continue;
+ if (cluster->active_cpus <= cluster->max_cpus)
+ break;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ pr_debug("Trying to isolate CPU%u\n", c->cpu);
+ if (!sched_isolate_cpu(c->cpu)) {
+ c->isolated_by_us = true;
+ move_cpu_lru(c);
+ nr_isolated++;
+ } else {
+ pr_debug("Unable to isolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ spin_lock_irqsave(&state_lock, flags);
+ }
+ cluster->nr_isolated_cpus += nr_isolated;
+ spin_unlock_irqrestore(&state_lock, flags);
+
+}
+
+static void __try_to_unisolate(struct cluster_data *cluster,
+ unsigned int need, bool force)
+{
+ struct cpu_data *c, *tmp;
+ unsigned long flags;
+ unsigned int num_cpus = cluster->num_cpus;
+ unsigned int nr_unisolated = 0;
+
+ /*
+ * Protect against entry being removed (and added at tail) by other
+ * thread (hotplug).
+ */
+ spin_lock_irqsave(&state_lock, flags);
+ list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+ if (!num_cpus--)
+ break;
+
+ if (!c->isolated_by_us)
+ continue;
+ if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) ||
+ (!force && c->not_preferred))
+ continue;
+ if (cluster->active_cpus == need)
+ break;
+
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ pr_debug("Trying to unisolate CPU%u\n", c->cpu);
+ if (!sched_unisolate_cpu(c->cpu)) {
+ c->isolated_by_us = false;
+ move_cpu_lru(c);
+ nr_unisolated++;
+ } else {
+ pr_debug("Unable to unisolate CPU%u\n", c->cpu);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ spin_lock_irqsave(&state_lock, flags);
+ }
+ cluster->nr_isolated_cpus -= nr_unisolated;
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void try_to_unisolate(struct cluster_data *cluster, unsigned int need)
+{
+ bool force_use_non_preferred = false;
+
+ __try_to_unisolate(cluster, need, force_use_non_preferred);
+
+ if (cluster->active_cpus == need)
+ return;
+
+ force_use_non_preferred = true;
+ __try_to_unisolate(cluster, need, force_use_non_preferred);
+}
+
+static void __ref do_core_ctl(struct cluster_data *cluster)
+{
+ unsigned int need;
+
+ need = apply_limits(cluster, cluster->need_cpus);
+
+ if (adjustment_possible(cluster, need)) {
+ pr_debug("Trying to adjust group %u from %u to %u\n",
+ cluster->first_cpu, cluster->active_cpus, need);
+
+ if (cluster->active_cpus > need)
+ try_to_isolate(cluster, need);
+ else if (cluster->active_cpus < need)
+ try_to_unisolate(cluster, need);
+ }
+}
+
+static int __ref try_core_ctl(void *data)
+{
+ struct cluster_data *cluster = data;
+ unsigned long flags;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ if (!cluster->pending) {
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+ schedule();
+ if (kthread_should_stop())
+ break;
+ spin_lock_irqsave(&cluster->pending_lock, flags);
+ }
+ set_current_state(TASK_RUNNING);
+ cluster->pending = false;
+ spin_unlock_irqrestore(&cluster->pending_lock, flags);
+
+ do_core_ctl(cluster);
+ }
+
+ return 0;
+}
+
+static int __ref cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ uint32_t cpu = (uintptr_t)hcpu;
+ struct cpu_data *state = &per_cpu(cpu_state, cpu);
+ struct cluster_data *cluster = state->cluster;
+ unsigned int need;
+ bool do_wakeup, unisolated = false;
+ unsigned long flags;
+
+ if (unlikely(!cluster || !cluster->inited))
+ return NOTIFY_DONE;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ cluster->active_cpus = get_active_cpu_count(cluster);
+
+ /*
+ * Moving to the end of the list should only happen in
+ * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an
+ * infinite list traversal when thermal (or other entities)
+ * reject trying to online CPUs.
+ */
+ move_cpu_lru(state);
+ break;
+
+ case CPU_DEAD:
+ /*
+ * We don't want to have a CPU both offline and isolated.
+ * So unisolate a CPU that went down if it was isolated by us.
+ */
+ if (state->isolated_by_us) {
+ sched_unisolate_cpu_unlocked(cpu);
+ state->isolated_by_us = false;
+ unisolated = true;
+ }
+
+ /* Move a CPU to the end of the LRU when it goes offline. */
+ move_cpu_lru(state);
+
+ state->busy = 0;
+ cluster->active_cpus = get_active_cpu_count(cluster);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ need = apply_limits(cluster, cluster->need_cpus);
+ spin_lock_irqsave(&state_lock, flags);
+ if (unisolated)
+ cluster->nr_isolated_cpus--;
+ do_wakeup = adjustment_possible(cluster, need);
+ spin_unlock_irqrestore(&state_lock, flags);
+ if (do_wakeup)
+ wake_up_core_ctl_thread(cluster);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata cpu_notifier = {
+ .notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+static cpumask_var_t core_ctl_disable_cpumask;
+static bool core_ctl_disable_cpumask_present;
+
+static int __init core_ctl_disable_setup(char *str)
+{
+ if (!*str)
+ return -EINVAL;
+
+ alloc_bootmem_cpumask_var(&core_ctl_disable_cpumask);
+
+ if (cpulist_parse(str, core_ctl_disable_cpumask) < 0) {
+ free_bootmem_cpumask_var(core_ctl_disable_cpumask);
+ return -EINVAL;
+ }
+
+ core_ctl_disable_cpumask_present = true;
+ pr_info("disable_cpumask=%*pbl\n",
+ cpumask_pr_args(core_ctl_disable_cpumask));
+
+ return 0;
+}
+early_param("core_ctl_disable_cpumask", core_ctl_disable_setup);
+
+static bool should_skip(const struct cpumask *mask)
+{
+ if (!core_ctl_disable_cpumask_present)
+ return false;
+
+ /*
+	 * We operate on a per-cluster basis. Disable core_ctl for a
+	 * cluster if all of its CPUs are specified in
+	 * core_ctl_disable_cpumask.
+ */
+ return cpumask_subset(mask, core_ctl_disable_cpumask);
+}
+
+static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_clusters; ++i) {
+ if (cluster_state[i].first_cpu == first_cpu)
+ return &cluster_state[i];
+ }
+
+ return NULL;
+}
+
+static int cluster_init(const struct cpumask *mask)
+{
+ struct device *dev;
+ unsigned int first_cpu = cpumask_first(mask);
+ struct cluster_data *cluster;
+ struct cpu_data *state;
+ unsigned int cpu;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+ if (should_skip(mask))
+ return 0;
+
+ if (find_cluster_by_first_cpu(first_cpu))
+ return 0;
+
+ dev = get_cpu_device(first_cpu);
+ if (!dev)
+ return -ENODEV;
+
+ pr_info("Creating CPU group %d\n", first_cpu);
+
+ if (num_clusters == MAX_CLUSTERS) {
+ pr_err("Unsupported number of clusters. Only %u supported\n",
+ MAX_CLUSTERS);
+ return -EINVAL;
+ }
+ cluster = &cluster_state[num_clusters];
+ ++num_clusters;
+
+ cpumask_copy(&cluster->cpu_mask, mask);
+ cluster->num_cpus = cpumask_weight(mask);
+ if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) {
+ pr_err("HW configuration not supported\n");
+ return -EINVAL;
+ }
+ cluster->first_cpu = first_cpu;
+ cluster->min_cpus = 1;
+ cluster->max_cpus = cluster->num_cpus;
+ cluster->need_cpus = cluster->num_cpus;
+ cluster->offline_delay_ms = 100;
+ cluster->task_thres = UINT_MAX;
+ cluster->nrrun = cluster->num_cpus;
+ cluster->enable = true;
+ INIT_LIST_HEAD(&cluster->lru);
+ spin_lock_init(&cluster->pending_lock);
+
+ for_each_cpu(cpu, mask) {
+ pr_info("Init CPU%u state\n", cpu);
+
+ state = &per_cpu(cpu_state, cpu);
+ state->cluster = cluster;
+ state->cpu = cpu;
+ list_add_tail(&state->sib, &cluster->lru);
+ }
+ cluster->active_cpus = get_active_cpu_count(cluster);
+
+ cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster,
+ "core_ctl/%d", first_cpu);
+ if (IS_ERR(cluster->core_ctl_thread))
+ return PTR_ERR(cluster->core_ctl_thread);
+
+ sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO,
+ &param);
+
+ cluster->inited = true;
+
+ kobject_init(&cluster->kobj, &ktype_core_ctl);
+ return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl");
+}
+
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_policy *policy = data;
+ int ret;
+
+ switch (val) {
+ case CPUFREQ_CREATE_POLICY:
+ ret = cluster_init(policy->related_cpus);
+ if (ret)
+ pr_warn("unable to create core ctl group: %d\n", ret);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pol_nb = {
+ .notifier_call = cpufreq_policy_cb,
+};
+
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_govinfo *info = data;
+
+ switch (val) {
+ case CPUFREQ_LOAD_CHANGE:
+ core_ctl_set_busy(info->cpu, info->load);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_gov_nb = {
+ .notifier_call = cpufreq_gov_cb,
+};
+
+static int __init core_ctl_init(void)
+{
+ unsigned int cpu;
+
+ if (should_skip(cpu_possible_mask))
+ return 0;
+
+ core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE)
+ * NSEC_PER_MSEC;
+
+ register_cpu_notifier(&cpu_notifier);
+ cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+ cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ struct cpufreq_policy *policy;
+ int ret;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ ret = cluster_init(policy->related_cpus);
+ if (ret)
+ pr_warn("unable to create core ctl group: %d\n"
+ , ret);
+ cpufreq_cpu_put(policy);
+ }
+ }
+ cpu_maps_update_done();
+ initialized = true;
+ return 0;
+}
+
+late_initcall(core_ctl_init);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..14225d5d8617 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -27,6 +27,8 @@
* of the License.
*/
+#include "sched.h"
+
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -51,6 +53,27 @@ static int convert_prio(int prio)
}
/**
+ * drop_nopreempt_cpus - remove a cpu from the mask if it is likely
+ * non-preemptible
+ * @lowest_mask: mask with selected CPUs (non-NULL)
+ */
+static void
+drop_nopreempt_cpus(struct cpumask *lowest_mask)
+{
+ unsigned int cpu = cpumask_first(lowest_mask);
+
+ while (cpu < nr_cpu_ids) {
+ /* unlocked access */
+ struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr);
+
+ if (task_may_not_preempt(task, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+
+ cpu = cpumask_next(cpu, lowest_mask);
+ }
+}
+
+/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
@@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
{
int idx = 0;
int task_pri = convert_prio(p->prio);
+ bool drop_nopreempts = task_pri <= MAX_RT_PRIO;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+retry:
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
int skip = 0;
@@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
+ if (drop_nopreempts)
+ drop_nopreempt_cpus(lowest_mask);
/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
@@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
return 1;
}
-
+ /*
+	 * If no CPU was found after dropping the likely non-preemptible
+	 * ones, retry without dropping them so we can still find the
+	 * lowest-priority target and avoid priority inversion.
+ */
+ if (drop_nopreempts) {
+ drop_nopreempts = false;
+ goto retry;
+ }
return 0;
}
@@ -246,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp)
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
+
+/*
+ * cpupri_check_rt - check if the current CPU has an RT task queued
+ * Must be called from an rcu-sched read-side section.
+ */
+bool cpupri_check_rt(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ return cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL;
+}
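/*
 * Illustrative sketch, not part of this patch: a hypothetical in-kernel
 * caller of cpupri_check_rt(). The comment above requires an rcu-sched
 * read-side section, so the call is wrapped in rcu_read_lock_sched() /
 * rcu_read_unlock_sched().
 */
static bool example_this_cpu_runs_rt(void)
{
	bool rt;

	rcu_read_lock_sched();
	rt = cpupri_check_rt();
	rcu_read_unlock_sched();

	return rt;
}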
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index fc2cfd6b2941..e6ec68c15aa3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -6,7 +6,6 @@
#include <linux/context_tracking.h>
#include <linux/cpufreq_times.h>
#include "sched.h"
-#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -51,10 +50,8 @@ void irqtime_account_irq(struct task_struct *curr)
unsigned long flags;
s64 delta;
int cpu;
-#ifdef CONFIG_SCHED_WALT
u64 wallclock;
bool account = true;
-#endif
if (!sched_clock_irqtime)
return;
@@ -62,10 +59,8 @@ void irqtime_account_irq(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
-#ifdef CONFIG_SCHED_WALT
wallclock = sched_clock_cpu(cpu);
-#endif
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ delta = wallclock - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
irq_time_write_begin();
@@ -79,16 +74,16 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
-#ifdef CONFIG_SCHED_WALT
else
account = false;
-#endif
irq_time_write_end();
-#ifdef CONFIG_SCHED_WALT
+
if (account)
- walt_account_irqtime(cpu, curr, delta, wallclock);
-#endif
+ sched_account_irqtime(cpu, curr, delta, wallclock);
+ else if (curr != this_cpu_ksoftirqd())
+ sched_account_irqstart(cpu, curr, wallclock);
+
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5c6ffddcafcd..188c8388a63f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -291,9 +291,11 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
/*
* By now the task is replenished and enqueued; migrate it.
*/
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
activate_task(later_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
if (!fallback)
resched_curr(later_rq);
@@ -992,6 +994,41 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
@@ -1001,7 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
- walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
+ inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -1016,7 +1053,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
- walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
+ dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -1712,6 +1749,7 @@ retry:
goto retry;
}
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
clear_average_bw(&next_task->dl, &rq->dl);
next_task->on_rq = TASK_ON_RQ_MIGRATING;
@@ -1719,6 +1757,7 @@ retry:
next_task->on_rq = TASK_ON_RQ_QUEUED;
add_average_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
resched_curr(later_rq);
@@ -1804,6 +1843,7 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
clear_average_bw(&p->dl, &src_rq->dl);
p->on_rq = TASK_ON_RQ_MIGRATING;
@@ -1811,6 +1851,7 @@ static void pull_dl_task(struct rq *this_rq)
p->on_rq = TASK_ON_RQ_QUEUED;
add_average_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
dmin = p->dl.deadline;
/* Is there any other task even earlier? */
@@ -2012,6 +2053,11 @@ const struct sched_class dl_sched_class = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_dl,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_dl,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7f7116622631..ed8e6bb4531b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -227,6 +227,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
cfs_rq->throttle_count);
+ SEQ_printf(m, " .%-30s: %d\n", "runtime_enabled",
+ cfs_rq->runtime_enabled);
+#ifdef CONFIG_SCHED_HMP
+ SEQ_printf(m, " .%-30s: %d\n", "nr_big_tasks",
+ cfs_rq->hmp_stats.nr_big_tasks);
+ SEQ_printf(m, " .%-30s: %llu\n", "cumulative_runnable_avg",
+ cfs_rq->hmp_stats.cumulative_runnable_avg);
+#endif
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -306,6 +314,23 @@ do { \
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
+#ifdef CONFIG_SMP
+ P(cpu_capacity);
+#endif
+#ifdef CONFIG_SCHED_HMP
+ P(static_cpu_pwr_cost);
+ P(cluster->static_cluster_pwr_cost);
+ P(cluster->load_scale_factor);
+ P(cluster->capacity);
+ P(cluster->max_possible_capacity);
+ P(cluster->efficiency);
+ P(cluster->cur_freq);
+ P(cluster->max_freq);
+ P(cluster->exec_scale_factor);
+ P(hmp_stats.nr_big_tasks);
+ SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg",
+ rq->hmp_stats.cumulative_runnable_avg);
+#endif
#undef P
#undef PN
@@ -386,6 +411,15 @@ static void sched_debug_header(struct seq_file *m)
PN(sysctl_sched_wakeup_granularity);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
+#ifdef CONFIG_SCHED_HMP
+ P(sched_upmigrate);
+ P(sched_downmigrate);
+ P(sched_init_task_load_windows);
+ P(min_capacity);
+ P(max_capacity);
+ P(sched_ravg_window);
+ P(sched_load_granule);
+#endif
#undef PN
#undef P
@@ -408,6 +442,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
void sysrq_sched_debug_show(void)
{
int cpu;
@@ -417,6 +452,7 @@ void sysrq_sched_debug_show(void)
print_cpu(NULL, cpu);
}
+#endif
/*
* This itererator needs some explanation.
@@ -547,6 +583,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
+ unsigned int load_avg;
+
+ load_avg = pct_task_load(p);
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
@@ -624,6 +663,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_cas_attempts);
P(se.statistics.nr_wakeups_cas_count);
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ __P(load_avg);
+#ifdef CONFIG_SCHED_HMP
+ P(ravg.demand);
+#endif
+#endif
+
{
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
index b0656b7a93e3..50d183b1e156 100644
--- a/kernel/sched/energy.c
+++ b/kernel/sched/energy.c
@@ -27,7 +27,10 @@
#include <linux/sched_energy.h>
#include <linux/stddef.h>
+#include "sched.h"
+
struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+bool sched_energy_aware;
static void free_resources(void)
{
@@ -56,6 +59,13 @@ void init_sched_energy_costs(void)
int sd_level, i, nstates, cpu;
const __be32 *val;
+ if (!energy_aware()) {
+ sched_energy_aware = false;
+ return;
+ }
+
+ sched_energy_aware = true;
+
for_each_possible_cpu(cpu) {
cn = of_get_cpu_node(cpu, NULL);
if (!cn) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54cf946d1a4c..ddd3e34ee82a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,9 +32,8 @@
#include <linux/task_work.h>
#include <linux/module.h>
-#include <trace/events/sched.h>
-
#include "sched.h"
+#include <trace/events/sched.h>
#include "tune.h"
#include "walt.h"
@@ -56,12 +55,6 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL;
unsigned int sysctl_sched_sync_hint_enable = 1;
unsigned int sysctl_sched_cstate_aware = 1;
-#ifdef CONFIG_SCHED_WALT
-unsigned int sysctl_sched_use_walt_cpu_util = 1;
-unsigned int sysctl_sched_use_walt_task_util = 1;
-__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
- (10 * NSEC_PER_MSEC);
-#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -254,6 +247,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#ifdef CONFIG_SMP
+static int active_load_balance_cpu_stop(void *data);
+#endif
const struct sched_class fair_sched_class;
@@ -891,12 +887,56 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
+#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
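/*
 * Worked example (hypothetical timestamps, not part of this patch): a task
 * starts waiting on CPU0 at rq_clock 100 (wait_start = 100) and is picked
 * for migration at 130. update_stats_wait_end() sees delta = 30 and, because
 * the task is migrating, stashes 30 in wait_start instead of reporting it.
 * When update_stats_wait_start() runs on CPU1 at rq_clock 150, it subtracts
 * the stashed 30 and back-dates wait_start to 120, so a wait that finally
 * ends at 170 is reported as 50, the full time spent waiting on both CPUs.
 */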
+
/*
* Task is being enqueued - update stats:
*/
@@ -910,23 +950,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2613,7 +2636,27 @@ static inline void update_cfs_shares(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
+u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+ u32 enabled = p->flags & PF_WAKE_UP_IDLE;
+
+ return !!enabled;
+}
+EXPORT_SYMBOL(sched_get_wake_up_idle);
+
+int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
+{
+ int enable = !!wake_up_idle;
+
+ if (enable)
+ p->flags |= PF_WAKE_UP_IDLE;
+ else
+ p->flags &= ~PF_WAKE_UP_IDLE;
+
+ return 0;
+}
+EXPORT_SYMBOL(sched_set_wake_up_idle);
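/*
 * Illustrative sketch, not part of this patch: a hypothetical driver using
 * the exported hint API above. sched_set_wake_up_idle() only sets or clears
 * PF_WAKE_UP_IDLE on the task; the wakeup paths below (wake_to_idle() and
 * select_idle_sibling()) then treat the flag as a request to prefer idle
 * CPUs for that task.
 */
static void example_mark_latency_sensitive(struct task_struct *p)
{
	/* request wakeups of @p on idle CPUs */
	sched_set_wake_up_idle(p, 1);
}

static void example_clear_latency_hint(struct task_struct *p)
{
	if (sched_get_wake_up_idle(p))
		sched_set_wake_up_idle(p, 0);
}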
+
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
@@ -2693,6 +2736,1064 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
+#ifdef CONFIG_SCHED_HMP
+
+/* CPU selection flag */
+#define SBC_FLAG_PREV_CPU 0x1
+#define SBC_FLAG_BEST_CAP_CPU 0x2
+#define SBC_FLAG_CPU_COST 0x4
+#define SBC_FLAG_MIN_COST 0x8
+#define SBC_FLAG_IDLE_LEAST_LOADED 0x10
+#define SBC_FLAG_IDLE_CSTATE 0x20
+#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
+#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
+#define SBC_FLAG_CSTATE_LOAD 0x100
+#define SBC_FLAG_BEST_SIBLING 0x200
+#define SBC_FLAG_WAKER_CPU 0x400
+#define SBC_FLAG_PACK_TASK 0x800
+
+/* Cluster selection flag */
+#define SBC_FLAG_COLOC_CLUSTER 0x10000
+#define SBC_FLAG_WAKER_CLUSTER 0x20000
+#define SBC_FLAG_BACKUP_CLUSTER 0x40000
+#define SBC_FLAG_BOOST_CLUSTER 0x80000
+
+struct cpu_select_env {
+ struct task_struct *p;
+ struct related_thread_group *rtg;
+ u8 reason;
+ u8 need_idle:1;
+ u8 need_waker_cluster:1;
+ u8 sync:1;
+ enum sched_boost_policy boost_policy;
+ u8 pack_task:1;
+ int prev_cpu;
+ DECLARE_BITMAP(candidate_list, NR_CPUS);
+ DECLARE_BITMAP(backup_list, NR_CPUS);
+ u64 task_load;
+ u64 cpu_load;
+ u32 sbc_best_flag;
+ u32 sbc_best_cluster_flag;
+ struct cpumask search_cpus;
+};
+
+struct cluster_cpu_stats {
+ int best_idle_cpu, least_loaded_cpu;
+ int best_capacity_cpu, best_cpu, best_sibling_cpu;
+ int min_cost, best_sibling_cpu_cost;
+ int best_cpu_wakeup_latency;
+ u64 min_load, best_load, best_sibling_cpu_load;
+ s64 highest_spare_capacity;
+};
+
+/*
+ * Should task be woken to any available idle cpu?
+ *
+ * Waking tasks on an idle cpu has mixed implications for both performance
+ * and power. In many cases the scheduler cannot correctly estimate the
+ * impact of using idle cpus on either. PF_WAKE_UP_IDLE allows an external
+ * kernel module to pass a strong hint to the scheduler that the task in
+ * question should be woken on an idle cpu, generally to improve performance.
+ */
+static inline int wake_to_idle(struct task_struct *p)
+{
+ return (current->flags & PF_WAKE_UP_IDLE) ||
+ (p->flags & PF_WAKE_UP_IDLE);
+}
+
+static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+ u64 total_load;
+
+ total_load = env->task_load + env->cpu_load;
+
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
+
+ return 0;
+}
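/*
 * Worked example (hypothetical values, not part of this patch): with
 * sched_spill_load == 900 and sysctl_sched_spill_nr_run == 3, a CPU whose
 * synced load is 700 and which already has 2 runnable tasks rejects a
 * waking task of load 250 (700 + 250 > 900) but remains a candidate for a
 * task of load 150 (850 <= 900 and 2 + 1 <= 3 runnable tasks).
 */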
+
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+ int tcpu = task_cpu(env->p);
+ int skip = 0;
+
+ if (!env->reason)
+ return 0;
+
+ if (is_reserved(cpu))
+ return 1;
+
+ switch (env->reason) {
+ case UP_MIGRATION:
+ skip = !idle_cpu(cpu);
+ break;
+ case IRQLOAD_MIGRATION:
+ /* Purposely fall through */
+ default:
+ skip = (cpu == tcpu);
+ break;
+ }
+
+ return skip;
+}
+
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ int tcpu;
+
+ if (!env->reason)
+ return 1;
+
+ tcpu = task_cpu(env->p);
+ switch (env->reason) {
+ case UP_MIGRATION:
+ return cluster->capacity > cpu_capacity(tcpu);
+
+ case DOWN_MIGRATION:
+ return cluster->capacity < cpu_capacity(tcpu);
+
+ default:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ if (!test_bit(cluster->id, env->candidate_list))
+ return 1;
+
+ if (!acceptable_capacity(cluster, env)) {
+ __clear_bit(cluster->id, env->candidate_list);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+ struct sched_cluster *cluster;
+
+ if (env->rtg) {
+ int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
+
+ if (task_load_will_fit(env->p, env->task_load,
+ cpu, env->boost_policy)) {
+ env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
+
+ if (env->boost_policy == SCHED_BOOST_NONE)
+ return env->rtg->preferred_cluster;
+
+ for_each_sched_cluster(cluster) {
+ if (cluster != env->rtg->preferred_cluster) {
+ __set_bit(cluster->id,
+ env->backup_list);
+ __clear_bit(cluster->id,
+ env->candidate_list);
+ }
+ }
+
+ return env->rtg->preferred_cluster;
+ }
+
+ /*
+ * Since the task load does not fit on the preferred
+ * cluster anymore, pretend that the task does not
+ * have any preferred cluster. This allows the waking
+ * task to get the appropriate CPU it needs as per the
+ * non co-location placement policy without having to
+ * wait until the preferred cluster is updated.
+ */
+ env->rtg = NULL;
+ }
+
+ for_each_sched_cluster(cluster) {
+ if (!skip_cluster(cluster, env)) {
+ int cpu = cluster_first_cpu(cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cpu);
+ if (task_load_will_fit(env->p, env->task_load, cpu,
+ env->boost_policy))
+ return cluster;
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ }
+ }
+
+ return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+ int cluster_id;
+
+ cluster_id = find_next_bit(list, end, start - 1 + 1);
+ if (cluster_id >= end)
+ return NULL;
+
+ return sched_cluster[cluster_id];
+}
+
+static void
+update_spare_capacity(struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu, int capacity,
+ u64 cpu_load)
+{
+ s64 spare_capacity = sched_ravg_window - cpu_load;
+
+ if (spare_capacity > 0 &&
+ (spare_capacity > stats->highest_spare_capacity ||
+ (spare_capacity == stats->highest_spare_capacity &&
+ ((!env->need_waker_cluster &&
+ capacity > cpu_capacity(stats->best_capacity_cpu)) ||
+ (env->need_waker_cluster &&
+ cpu_rq(cpu)->nr_running <
+ cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
+ /*
+		 * If the sync waker is the only runnable task on its CPU, the
+		 * CPU's cr_avg is 0, so there is a high chance of placing the
+		 * wakee on the waker's CPU, which likely preempts the waker
+		 * and can then lead to migration of the preempted waker.
+		 * When possible, place the wakee on a truly idle CPU instead,
+		 * by checking nr_running, to avoid such preemption.
+ */
+ stats->highest_spare_capacity = spare_capacity;
+ stats->best_capacity_cpu = cpu;
+ }
+}
+
+static inline void find_backup_cluster(struct cpu_select_env *env,
+				       struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+ int i;
+ struct cpumask search_cpus;
+
+ while (!bitmap_empty(env->backup_list, num_clusters)) {
+ next = next_candidate(env->backup_list, 0, num_clusters);
+ __clear_bit(next->id, env->backup_list);
+
+ cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
+ for_each_cpu(i, &search_cpus) {
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i), power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ update_spare_capacity(stats, env, i, next->capacity,
+ cpu_load_sync(i, env->sync));
+ }
+ env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
+ }
+}
+
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
+ struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+
+ __clear_bit(cluster->id, env->candidate_list);
+
+ if (env->rtg && preferred_cluster(cluster, env->p))
+ return NULL;
+
+ do {
+ if (bitmap_empty(env->candidate_list, num_clusters))
+ return NULL;
+
+ next = next_candidate(env->candidate_list, 0, num_clusters);
+ if (next) {
+ if (next->min_power_cost > stats->min_cost) {
+ clear_bit(next->id, env->candidate_list);
+ next = NULL;
+ continue;
+ }
+
+ if (skip_cluster(next, env))
+ next = NULL;
+ }
+ } while (!next);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(next));
+ return next;
+}
+
+#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int wakeup_latency;
+ int prev_cpu = env->prev_cpu;
+
+ wakeup_latency = cpu_rq(cpu)->wakeup_latency;
+
+ if (env->need_idle) {
+ stats->min_cost = cpu_cost;
+ if (idle_cpu(cpu)) {
+ if (wakeup_latency < stats->best_cpu_wakeup_latency ||
+ (wakeup_latency == stats->best_cpu_wakeup_latency &&
+ cpu == prev_cpu)) {
+ stats->best_idle_cpu = cpu;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ }
+ } else {
+ if (env->cpu_load < stats->min_load ||
+ (env->cpu_load == stats->min_load &&
+ cpu == prev_cpu)) {
+ stats->least_loaded_cpu = cpu;
+ stats->min_load = env->cpu_load;
+ }
+ }
+
+ return;
+ }
+
+ if (cpu_cost < stats->min_cost) {
+ stats->min_cost = cpu_cost;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CPU_COST;
+ return;
+ }
+
+ /* CPU cost is the same. Start breaking the tie by C-state */
+
+ if (wakeup_latency > stats->best_cpu_wakeup_latency)
+ return;
+
+ if (wakeup_latency < stats->best_cpu_wakeup_latency) {
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
+ return;
+ }
+
+ /* C-state is the same. Use prev CPU to break the tie */
+ if (cpu == prev_cpu) {
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
+ return;
+ }
+
+ if (stats->best_cpu != prev_cpu &&
+ ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
+ (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
+ }
+}
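/*
 * Worked example (not part of this patch) of the tie-breaking order above
 * for the non-need_idle case: power cost first, then C-state exit latency,
 * then the task's previous CPU, then load. If two candidate CPUs have equal
 * cost and equal wakeup latency and neither is the previous CPU, the less
 * loaded one wins when both are awake (latency 0), while the more loaded
 * one wins when both are in a C-state, so the already-idle CPU can stay
 * idle.
 */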
+#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int prev_cpu = env->prev_cpu;
+
+ if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+ if (stats->best_sibling_cpu_cost > cpu_cost ||
+ (stats->best_sibling_cpu_cost == cpu_cost &&
+ stats->best_sibling_cpu_load > env->cpu_load)) {
+ stats->best_sibling_cpu_cost = cpu_cost;
+ stats->best_sibling_cpu_load = env->cpu_load;
+ stats->best_sibling_cpu = cpu;
+ }
+ }
+
+ if ((cpu_cost < stats->min_cost) ||
+ ((stats->best_cpu != prev_cpu &&
+ stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
+ if (env->need_idle) {
+ if (idle_cpu(cpu)) {
+ stats->min_cost = cpu_cost;
+ stats->best_idle_cpu = cpu;
+ }
+ } else {
+ stats->min_cost = cpu_cost;
+ stats->min_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_MIN_COST;
+ }
+ }
+}
+#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env)
+{
+ int cpu_cost;
+
+ /*
+ * We try to find the least loaded *busy* CPU irrespective
+ * of the power cost.
+ */
+ if (env->pack_task)
+ cpu_cost = cpu_min_power_cost(cpu);
+
+ else
+ cpu_cost = power_cost(cpu, task_load(env->p) +
+ cpu_cravg_sync(cpu, env->sync));
+
+ if (cpu_cost <= stats->min_cost)
+ __update_cluster_stats(cpu, stats, env, cpu_cost);
+}
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int i;
+ struct cpumask search_cpus;
+
+ cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
+
+ env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
+
+ for_each_cpu(i, &search_cpus) {
+ env->cpu_load = cpu_load_sync(i, env->sync);
+
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ if (skip_cpu(i, env))
+ continue;
+
+ update_spare_capacity(stats, env, i, c->capacity,
+ env->cpu_load);
+
+ /*
+		 * need_idle takes precedence over sched boost, but when both
+		 * are set, the idlest CPU within all the clusters is selected
+		 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in
+		 * the big cluster is selected when boost_policy = BOOST_ON_BIG.
+ */
+ if ((!env->need_idle &&
+ env->boost_policy != SCHED_BOOST_NONE) ||
+ env->need_waker_cluster ||
+ sched_cpu_high_irqload(i) ||
+ spill_threshold_crossed(env, cpu_rq(i)))
+ continue;
+
+ update_cluster_stats(i, stats, env);
+ }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+ stats->best_cpu = stats->best_idle_cpu = -1;
+ stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
+ stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+ stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+ stats->highest_spare_capacity = 0;
+ stats->least_loaded_cpu = -1;
+ stats->best_cpu_wakeup_latency = INT_MAX;
+ /* No need to initialize stats->best_load */
+}
+
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+ if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+ env->reason)
+ return true;
+
+ return false;
+}
+
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int prev_cpu;
+ struct task_struct *task = env->p;
+ struct sched_cluster *cluster;
+
+ if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
+ return false;
+
+ prev_cpu = env->prev_cpu;
+ if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
+ return false;
+
+ if (task->ravg.mark_start - task->last_cpu_selected_ts >=
+ sched_long_cpu_selection_threshold)
+ return false;
+
+ /*
+	 * This function should be used only on the task wakeup path, as it
+	 * treats p->last_switch_out_ts as the last sleep time.
+	 * p->last_switch_out_ts can denote the last preemption time as well
+	 * as the last sleep time.
+ */
+ if (task->ravg.mark_start - task->last_switch_out_ts >=
+ sched_short_sleep_task_threshold)
+ return false;
+
+ env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+ cluster = cpu_rq(prev_cpu)->cluster;
+
+ if (!task_load_will_fit(task, env->task_load, prev_cpu,
+ sched_boost_policy())) {
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ return false;
+ }
+
+ env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+ if (sched_cpu_high_irqload(prev_cpu) ||
+ spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+ update_spare_capacity(stats, env, prev_cpu,
+ cluster->capacity, env->cpu_load);
+ cpumask_clear_cpu(prev_cpu, &env->search_cpus);
+ return false;
+ }
+
+ return true;
+}
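/*
 * Worked example (hypothetical thresholds, not part of this patch): with
 * sched_short_sleep_task_threshold == 10ms and
 * sched_long_cpu_selection_threshold == 100ms, a wakee whose last full CPU
 * selection was 40ms ago and which slept for only 3ms keeps its previous
 * CPU, provided it still fits there and that CPU is neither irq-loaded nor
 * past the spill threshold. A wakee that slept 50ms, or whose last
 * selection was 200ms ago, falls through to the full cluster search instead.
 */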
+
+static inline bool
+wake_to_waker_cluster(struct cpu_select_env *env)
+{
+ return env->sync &&
+ task_load(current) > sched_big_waker_task_load &&
+ task_load(env->p) < sched_small_wakee_task_load;
+}
+
+static inline bool
+bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
+{
+ return sysctl_sched_prefer_sync_wakee_to_waker &&
+ cpu_rq(cpu)->nr_running == 1 &&
+ cpumask_test_cpu(cpu, &env->search_cpus);
+}
+
+static inline int
+cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
+{
+ return cpumask_intersects(&env->search_cpus, &cluster->cpus);
+}
+
+/* return cheapest cpu that can fit this task */
+static int select_best_cpu(struct task_struct *p, int target, int reason,
+ int sync)
+{
+ struct sched_cluster *cluster, *pref_cluster = NULL;
+ struct cluster_cpu_stats stats;
+ struct related_thread_group *grp;
+ unsigned int sbc_flag = 0;
+ int cpu = raw_smp_processor_id();
+ bool special;
+
+ struct cpu_select_env env = {
+ .p = p,
+ .reason = reason,
+ .need_idle = wake_to_idle(p),
+ .need_waker_cluster = 0,
+ .sync = sync,
+ .prev_cpu = target,
+ .rtg = NULL,
+ .sbc_best_flag = 0,
+ .sbc_best_cluster_flag = 0,
+ .pack_task = false,
+ };
+
+ env.boost_policy = task_sched_boost(p) ?
+ sched_boost_policy() : SCHED_BOOST_NONE;
+
+ bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+ bitmap_zero(env.backup_list, NR_CPUS);
+
+ cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
+ cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
+
+ init_cluster_cpu_stats(&stats);
+ special = env_has_special_flags(&env);
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+
+ if (grp && grp->preferred_cluster) {
+ pref_cluster = grp->preferred_cluster;
+ if (!cluster_allowed(&env, pref_cluster))
+ clear_bit(pref_cluster->id, env.candidate_list);
+ else
+ env.rtg = grp;
+ } else if (!special) {
+ cluster = cpu_rq(cpu)->cluster;
+ if (wake_to_waker_cluster(&env)) {
+ if (bias_to_waker_cpu(&env, cpu)) {
+ target = cpu;
+ sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+ SBC_FLAG_WAKER_CPU;
+ goto out;
+ } else if (cluster_allowed(&env, cluster)) {
+ env.need_waker_cluster = 1;
+ bitmap_zero(env.candidate_list, NR_CPUS);
+ __set_bit(cluster->id, env.candidate_list);
+ env.sbc_best_cluster_flag =
+ SBC_FLAG_WAKER_CLUSTER;
+ }
+ } else if (bias_to_prev_cpu(&env, &stats)) {
+ sbc_flag = SBC_FLAG_PREV_CPU;
+ goto out;
+ }
+ }
+
+ if (!special && is_short_burst_task(p)) {
+ env.pack_task = true;
+ sbc_flag = SBC_FLAG_PACK_TASK;
+ }
+retry:
+ cluster = select_least_power_cluster(&env);
+
+ if (!cluster)
+ goto out;
+
+ /*
+	 * 'cluster' now points to the minimum-power cluster which can satisfy
+	 * the task's perf goals. Walk down the cluster list starting with that
+	 * cluster. For non-small tasks, skip clusters that don't have
+	 * mostly_idle/idle cpus.
+ */
+
+ do {
+ find_best_cpu_in_cluster(cluster, &env, &stats);
+
+ } while ((cluster = next_best_cluster(cluster, &env, &stats)));
+
+ if (env.need_idle) {
+ if (stats.best_idle_cpu >= 0) {
+ target = stats.best_idle_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_CSTATE;
+ } else if (stats.least_loaded_cpu >= 0) {
+ target = stats.least_loaded_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
+ }
+ } else if (stats.best_cpu >= 0) {
+ if (stats.best_sibling_cpu >= 0 &&
+ stats.best_cpu != task_cpu(p) &&
+ stats.min_cost == stats.best_sibling_cpu_cost) {
+ stats.best_cpu = stats.best_sibling_cpu;
+ sbc_flag |= SBC_FLAG_BEST_SIBLING;
+ }
+ sbc_flag |= env.sbc_best_flag;
+ target = stats.best_cpu;
+ } else {
+ if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
+ env.rtg = NULL;
+ goto retry;
+ }
+
+ /*
+ * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
+ * backup_list = little cluster, candidate_list = none and
+ * stats->best_capacity_cpu points the best spare capacity
+ * CPU among the CPUs in the big cluster.
+ */
+ if (env.boost_policy == SCHED_BOOST_ON_BIG &&
+ stats.best_capacity_cpu >= 0)
+ sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
+ else
+ find_backup_cluster(&env, &stats);
+
+ if (stats.best_capacity_cpu >= 0) {
+ target = stats.best_capacity_cpu;
+ sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
+ }
+ }
+ p->last_cpu_selected_ts = sched_ktime_clock();
+out:
+ sbc_flag |= env.sbc_best_cluster_flag;
+ rcu_read_unlock();
+ trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
+ env.reason, env.sync, env.need_idle, sbc_flag, target);
+ return target;
+}
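/*
 * Illustrative decoding (hypothetical values, not part of this patch): the
 * sbc_flag reported by trace_sched_task_load() is an OR of the SBC_FLAG_*
 * bits defined above. 0x20400 would decode to SBC_FLAG_WAKER_CLUSTER |
 * SBC_FLAG_WAKER_CPU, i.e. a small sync wakee placed on its waker's CPU,
 * while 0x10004 would decode to SBC_FLAG_COLOC_CLUSTER | SBC_FLAG_CPU_COST,
 * i.e. a task kept in its related-thread-group's preferred cluster and
 * placed on the lowest-cost CPU there.
 */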
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+
+ return (&tg->list == &task_groups) ? NULL : tg;
+}
+
+/* Iterate over all cfs_rq in a cpu */
+#define for_each_cfs_rq(cfs_rq, tg, cpu) \
+ for (tg = container_of(&task_groups, struct task_group, list); \
+ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+
+ for_each_cfs_rq(cfs_rq, tg, cpu)
+ reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+
+ rcu_read_unlock();
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+
+/* Add task's contribution to a cpu's HMP statistics */
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+	 * Although the check below is not strictly required (inc/dec_nr_big_task
+	 * and inc/dec_cumulative_runnable_avg, called from
+	 * inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
+	 * efficiency by short-circuiting the for_each_sched_entity() loop when
+	 * sched_disable_window_stats is set.
+ */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Remove task's contribution from a cpu's HMP statistics */
+static void
+_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /* See comment on efficiency in _inc_hmp_sched_stats_fair */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ dec_rq_hmp_stats(rq, p, change_cra);
+}
+
+static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _inc_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _dec_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se) {
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+ }
+}
+
+static int task_will_be_throttled(struct task_struct *p);
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+static void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Reset balance_interval at all sched_domain levels of the given cpu, so
+ * that it honors the kick.
+ */
+static inline void reset_balance_interval(int cpu)
+{
+ struct sched_domain *sd;
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ sd->balance_interval = 0;
+ rcu_read_unlock();
+}
+
+/*
+ * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the
+ * ideal cpu as per its demand or priority).
+ *
+ * Returns the reason why the task needs to be migrated.
+ */
+static inline int migration_needed(struct task_struct *p, int cpu)
+{
+ int nice;
+ struct related_thread_group *grp;
+
+ if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
+ return 0;
+
+ /* No need to migrate task that is about to be throttled */
+ if (task_will_be_throttled(p))
+ return 0;
+
+ if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
+ cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
+ return UP_MIGRATION;
+
+ if (sched_cpu_high_irqload(cpu))
+ return IRQLOAD_MIGRATION;
+
+ nice = task_nice(p);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ /*
+ * Don't assume higher capacity means higher power. If the task
+ * is running on the power efficient CPU, avoid migrating it
+ * to a lower capacity cluster.
+ */
+ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p)) &&
+ cpu_capacity(cpu) > min_capacity &&
+ cpu_max_power_cost(cpu) == max_power_cost) {
+ rcu_read_unlock();
+ return DOWN_MIGRATION;
+ }
+
+ if (!task_will_fit(p, cpu)) {
+ rcu_read_unlock();
+ return UP_MIGRATION;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
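/*
 * Illustrative decision walk (not part of this patch), in the order the
 * checks above run: a task about to hit CFS bandwidth throttling is left
 * alone; a boosted task on a non-max-capacity CPU under SCHED_BOOST_ON_BIG
 * returns UP_MIGRATION; a task on a CPU with high IRQ load returns
 * IRQLOAD_MIGRATION; an ungrouped low-priority (or upmigrate-discouraged)
 * task sitting on the most power-expensive CPU returns DOWN_MIGRATION; and
 * a task whose demand no longer fits its current CPU returns UP_MIGRATION.
 */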
+
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (do_migration(reason, new_cpu, cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->hmp_stats.nr_big_tasks = 0;
+ cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
+ cfs_rq->hmp_stats.pred_demands_sum = 0;
+}
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg +=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
+}
+
+static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg -=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
+
+ BUG_ON(stats->nr_big_tasks < 0 ||
+ (s64)stats->cumulative_runnable_avg < 0);
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#define dec_throttled_cfs_rq_hmp_stats(...)
+#define inc_throttled_cfs_rq_hmp_stats(...)
+
+#endif /* CONFIG_SCHED_HMP */
+
#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
#error "load tracking assumes 2^10 as unit"
#endif
@@ -2816,6 +3917,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * scaled_delta;
}
+
if (running)
sa->util_sum += scaled_delta * scale_cpu;
@@ -3384,6 +4486,12 @@ static inline int idle_balance(struct rq *rq)
return 0;
}
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
#endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4010,6 +5118,35 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Check if task is part of a hierarchy where some cfs_rq does not have any
+ * runtime left.
+ *
+ * We can't rely on throttled_hierarchy() to do this test, as
+ * cfs_rq->throttle_count will not be updated yet when this function is called
+ * from scheduler_tick()
+ */
+static int task_will_be_throttled(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return 0;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ if (!cfs_rq->runtime_enabled)
+ continue;
+ if (cfs_rq->runtime_remaining <= 0)
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
@@ -4089,13 +5226,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
if (qcfs_rq->load.weight)
dequeue = 0;
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, task_delta);
+ dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
+ }
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -4120,6 +5260,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
+
+ /* Log effect on hmp stats after throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4129,6 +5275,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
int enqueue = 1;
long task_delta;
+ struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4156,17 +5303,26 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
if (cfs_rq_throttled(cfs_rq))
break;
}
- if (!se)
+ if (!se) {
add_nr_running(rq, task_delta);
+ inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
+ }
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
+
+ /* Log effect on hmp stats after un-throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
@@ -4519,6 +5675,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ init_cfs_rq_hmp_stats(cfs_rq);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4634,7 +5791,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -4650,8 +5807,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
/*
* called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * current task is from our class.
*/
static void hrtick_update(struct rq *rq)
{
@@ -4660,8 +5816,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -4719,7 +5874,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
- walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
flags = ENQUEUE_WAKEUP;
}
@@ -4727,7 +5882,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
- walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4736,8 +5891,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ inc_rq_hmp_stats(rq, p, 1);
+ }
#ifdef CONFIG_SMP
@@ -4760,8 +5917,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
schedtune_enqueue_task(p, cpu_of(rq));
- if (!se) {
- walt_inc_cumulative_runnable_avg(rq, p);
+ if (energy_aware() && !se) {
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
@@ -4799,7 +5955,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4819,7 +5975,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4828,8 +5984,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, 1);
+ dec_rq_hmp_stats(rq, p, 1);
+ }
#ifdef CONFIG_SMP
@@ -4842,8 +6000,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
schedtune_dequeue_task(p, cpu_of(rq));
- if (!se)
- walt_dec_cumulative_runnable_avg(rq, p);
#endif /* CONFIG_SMP */
hrtick_update(rq);
@@ -5256,11 +6412,6 @@ unsigned long capacity_curr_of(int cpu)
>> SCHED_CAPACITY_SHIFT;
}
-static inline bool energy_aware(void)
-{
- return sched_feat(ENERGY_AWARE);
-}
-
struct energy_env {
struct sched_group *sg_top;
struct sched_group *sg_cap;
@@ -5860,12 +7011,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
static inline unsigned long task_util(struct task_struct *p)
{
-#ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- unsigned long demand = p->ravg.demand;
- return (demand << 10) / walt_ravg_window;
- }
-#endif
return p->se.avg.util_avg;
}
@@ -6250,6 +7395,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
+ if (!(current->flags & PF_WAKE_UP_IDLE) &&
+ !(p->flags & PF_WAKE_UP_IDLE))
+ return target;
+
/*
	 * Otherwise, iterate the domains and find an eligible idle cpu.
*/
@@ -6774,6 +7923,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
+#ifdef CONFIG_SCHED_HMP
+ return select_best_cpu(p, prev_cpu, 0, sync);
+#endif
+
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p, sibling_count_hint) &&
@@ -7360,6 +8513,10 @@ enum group_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
+#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
@@ -7377,6 +8534,8 @@ struct lb_env {
unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
+ unsigned int busiest_grp_capacity;
+ unsigned int busiest_nr_running;
unsigned int flags;
@@ -7387,6 +8546,7 @@ struct lb_env {
enum fbq_type fbq_type;
enum group_type busiest_group_type;
struct list_head tasks;
+ enum sched_boost_policy boost_policy;
};
/*
@@ -7484,6 +8644,7 @@ static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
+ int twf, group_cpus;
lockdep_assert_held(&env->src_rq->lock);
@@ -7530,6 +8691,39 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
	/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
+ if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
+ if (nr_big_tasks(env->src_rq) && !is_big_task(p))
+ return 0;
+
+ if (env->boost_policy == SCHED_BOOST_ON_BIG &&
+ !task_sched_boost(p))
+ return 0;
+ }
+
+ twf = task_will_fit(p, env->dst_cpu);
+
+ /*
+ * Attempt to not pull tasks that don't fit. We may get lucky and find
+ * one that actually fits.
+ */
+ if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
+ return 0;
+
+ if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+ !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
+ return 0;
+
+ /*
+ * Group imbalance can sometimes cause work to be pulled across groups
+ * even though the group could have managed the imbalance on its own.
+ * Prevent inter-cluster migrations for big tasks when the number of
+ * tasks is lower than the capacity of the group.
+ */
+ group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
+ SCHED_CAPACITY_SCALE);
+ if (!twf && env->busiest_nr_running <= group_cpus)
+ return 0;
+
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
@@ -7537,15 +8731,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) IDLE or NEWLY_IDLE balance.
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
- if (tsk_cache_hot <= 0 ||
+ if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
@@ -7565,10 +8760,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (task_in_related_thread_group(p))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
double_unlock_balance(env->src_rq, env->dst_rq);
}
@@ -7597,6 +8794,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
+
return p;
}
return NULL;
@@ -7616,12 +8814,20 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
unsigned long load;
int detached = 0;
+ int orig_loop = env->loop;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
+ if (!same_cluster(env->dst_cpu, env->src_cpu))
+ env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
+
+ if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
+ env->flags |= LBF_IGNORE_BIG_TASKS;
+
+redo:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
@@ -7683,6 +8889,15 @@ next:
list_move_tail(&p->se.group_node, tasks);
}
+ if (env->flags & (LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
+ tasks = &env->src_rq->cfs_tasks;
+ env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
+ env->loop = orig_loop;
+ goto redo;
+ }
+
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
@@ -7701,8 +8916,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -7846,6 +9061,10 @@ struct sg_lb_stats {
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
+#ifdef CONFIG_SCHED_HMP
+ unsigned long sum_nr_big_tasks;
+ u64 group_cpu_load; /* Scaled load of all CPUs of the group */
+#endif
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
@@ -7889,10 +9108,64 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.avg_load = 0UL,
.sum_nr_running = 0,
.group_type = group_other,
+#ifdef CONFIG_SCHED_HMP
+ .sum_nr_big_tasks = 0UL,
+ .group_cpu_load = 0ULL,
+#endif
},
};
}
+#ifdef CONFIG_SCHED_HMP
+
+static int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ int local_cpu, busiest_cpu;
+ int local_capacity, busiest_capacity;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
+ int boost = sched_boost();
+
+ if (!sysctl_sched_restrict_cluster_spill ||
+ boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
+ return 0;
+
+ local_cpu = group_first_cpu(sds->local);
+ busiest_cpu = group_first_cpu(sds->busiest);
+
+ local_capacity = cpu_max_possible_capacity(local_cpu);
+ busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
+
+ if (local_pwr_cost <= busiest_pwr_cost)
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
@@ -8036,6 +9309,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
+ if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+ continue;
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
@@ -8067,9 +9342,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
do {
struct sched_group_capacity *sgc = group->sgc;
- capacity += sgc->capacity;
- max_capacity = max(sgc->max_capacity, max_capacity);
- min_capacity = min(sgc->min_capacity, min_capacity);
+ cpumask_t *cpus = sched_group_cpus(group);
+
+ /* Revisit this later. This won't work for MT domain */
+ if (!cpu_isolated(cpumask_first(cpus))) {
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
+ }
group = group->next;
} while (group != child->groups);
}
@@ -8185,7 +9465,7 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
static inline enum
group_type group_classify(struct sched_group *group,
- struct sg_lb_stats *sgs)
+ struct sg_lb_stats *sgs, struct lb_env *env)
{
if (sgs->group_no_capacity)
return group_overloaded;
@@ -8254,6 +9534,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, 0),
+ cpu_temp(i));
+
+ if (cpu_isolated(i))
+ continue;
+
/* if we are entering idle and there are CPUs with
* their tick stopped, do an update for them
*/
@@ -8274,6 +9562,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (nr_running > 1)
*overload = true;
+#ifdef CONFIG_SCHED_HMP
+ sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
+ sgs->group_cpu_load += cpu_load(i);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -8285,25 +9578,62 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
- if (cpu_overutilized(i)) {
+ if (energy_aware() && cpu_overutilized(i)) {
*overutilized = true;
if (!sgs->group_misfit_task && rq->misfit_task)
sgs->group_misfit_task = capacity_of(i);
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Isolated CPU has no weight */
+ if (!group->group_weight) {
+ sgs->group_capacity = 0;
+ sgs->avg_load = 0;
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_other;
+ sgs->group_weight = group->group_weight;
+ } else {
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(group, sgs, env);
+ }
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+}
- sgs->group_weight = group->group_weight;
+#ifdef CONFIG_SCHED_HMP
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ if (env->idle != CPU_NOT_IDLE &&
+ cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
+ if (sgs->sum_nr_big_tasks >
+ sds->busiest_stat.sum_nr_big_tasks) {
+ env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
+ return true;
+ }
+ }
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ return false;
}
+#else
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ return false;
+}
+#endif
/**
* update_sd_pick_busiest - return 1 on busiest group
@@ -8325,35 +9655,40 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
+ return true;
+
if (sgs->group_type > busiest->group_type)
return true;
if (sgs->group_type < busiest->group_type)
return false;
- /*
- * Candidate sg doesn't face any serious load-balance problems
- * so don't pick it if the local sg is already filled up.
- */
- if (sgs->group_type == group_other &&
- !group_has_capacity(env, &sds->local_stat))
- return false;
+ if (energy_aware()) {
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
- /*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
- */
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+ }
asym_packing:
/* This is the busiest node in its class. */
@@ -8461,14 +9796,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
+ sgs->group_type = group_classify(sg, sgs, env);
}
/*
* Ignore task groups with misfit tasks if local group has no
* capacity or if per-cpu capacity isn't higher.
*/
- if (sgs->group_type == group_misfit_task &&
+ if (energy_aware() &&
+ sgs->group_type == group_misfit_task &&
(!group_has_capacity(env, &sds->local_stat) ||
!group_smaller_cpu_capacity(sg, sds->local)))
sgs->group_type = group_other;
@@ -8476,6 +9812,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
+ env->busiest_nr_running = sgs->sum_nr_running;
+ env->busiest_grp_capacity = sgs->group_capacity;
}
next_group:
@@ -8497,12 +9835,12 @@ next_group:
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized) {
+ if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
trace_sched_overutilized(overutilized);
}
} else {
- if (!env->dst_rq->rd->overutilized && overutilized) {
+ if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
@@ -8654,20 +9992,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
- /* Misfitting tasks should be migrated in any case */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = busiest->group_misfit_task;
- return;
- }
+ if (energy_aware()) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
- /*
- * Busiest group is overloaded, local is not, use the spare
- * cycles to maximize throughput
- */
- if (busiest->group_type == group_overloaded &&
- local->group_type <= group_misfit_task) {
- env->imbalance = busiest->load_per_task;
- return;
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
}
env->imbalance = 0;
@@ -8704,7 +10044,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
/* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task)
+ if (energy_aware() && busiest->group_type == group_misfit_task)
env->imbalance = max_t(long, env->imbalance,
busiest->group_misfit_task);
@@ -8765,6 +10105,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ goto force_balance;
+
+ if (bail_inter_cluster_balance(env, &sds))
+ goto out_balanced;
+
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -8785,7 +10131,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* Misfitting tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task) {
+ if (energy_aware() && busiest->group_type == group_misfit_task) {
goto force_balance;
}
@@ -8836,6 +10182,60 @@ out_balanced:
return NULL;
}
+#ifdef CONFIG_SCHED_HMP
+static struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ struct rq *busiest = NULL, *busiest_big = NULL;
+ u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
+ int max_nr_big = 0, nr_big;
+ bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
+ int i;
+ cpumask_t cpus;
+
+ cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
+
+ for_each_cpu(i, &cpus) {
+ struct rq *rq = cpu_rq(i);
+ u64 cumulative_runnable_avg =
+ rq->hmp_stats.cumulative_runnable_avg;
+
+ if (!cpumask_test_cpu(i, env->cpus))
+ continue;
+
+ if (find_big) {
+ nr_big = nr_big_tasks(rq);
+ if (nr_big > max_nr_big ||
+ (nr_big > 0 && nr_big == max_nr_big &&
+ cumulative_runnable_avg > max_runnable_avg_big)) {
+ max_runnable_avg_big = cumulative_runnable_avg;
+ busiest_big = rq;
+ max_nr_big = nr_big;
+ continue;
+ }
+ }
+
+ if (cumulative_runnable_avg > max_runnable_avg) {
+ max_runnable_avg = cumulative_runnable_avg;
+ busiest = rq;
+ }
+ }
+
+ if (busiest_big)
+ return busiest_big;
+
+ env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
+ return busiest;
+}
+#else
+static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ return NULL;
+}
+#endif
+
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
@@ -8846,6 +10246,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
+#ifdef CONFIG_SCHED_HMP
+ return find_busiest_queue_hmp(env, group);
+#endif
+
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -8914,15 +10318,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* so long as it is large enough.
*/
-#define MAX_PINNED_INTERVAL 512
+#define MAX_PINNED_INTERVAL 16
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+#define NEED_ACTIVE_BALANCE_THRESHOLD 10
+
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ return 1;
+
if (env->idle == CPU_NEWLY_IDLE) {
/*
@@ -8947,7 +10356,8 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ if (energy_aware() &&
+ (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
env->src_rq->cfs.h_nr_running == 1 &&
cpu_overutilized(env->src_cpu) &&
@@ -8955,10 +10365,18 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+ return unlikely(sd->nr_balance_failed >
+ sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
-static int active_load_balance_cpu_stop(void *data);
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{
+ cpumask_t cpus;
+
+ cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+ cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+ return cpumask_first(&cpus);
+}
static int should_we_balance(struct lb_env *env)
{
@@ -8977,7 +10395,8 @@ static int should_we_balance(struct lb_env *env)
sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+ cpu_isolated(cpu))
continue;
balance_cpu = cpu;
@@ -8985,7 +10404,7 @@ static int should_we_balance(struct lb_env *env)
}
if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
+ balance_cpu = group_balance_cpu_not_isolated(sg);
/*
* First idle cpu or the first cpu(busiest) in this sched group
@@ -9002,23 +10421,29 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
- int ld_moved, cur_ld_moved, active_balance = 0;
+ int ld_moved = 0, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
- struct sched_group *group;
- struct rq *busiest;
+ struct sched_group *group = NULL;
+ struct rq *busiest = NULL;
unsigned long flags;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
- .sd = sd,
- .dst_cpu = this_cpu,
- .dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
- .idle = idle,
- .loop_break = sched_nr_migrate_break,
- .cpus = cpus,
- .fbq_type = all,
- .tasks = LIST_HEAD_INIT(env.tasks),
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ .imbalance = 0,
+ .flags = 0,
+ .loop = 0,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .boost_policy = sched_boost_policy(),
};
/*
@@ -9072,6 +10497,13 @@ more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
update_rq_clock(busiest);
+ /* The world might have changed. Validate assumptions */
+ if (busiest->nr_running <= 1) {
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ env.flags &= ~LBF_ALL_PINNED;
+ goto no_move;
+ }
+
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
@@ -9159,17 +10591,22 @@ more_balance:
}
}
+no_move:
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
+ schedstat_inc(sd, lb_failed[idle]);
+
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
- if (idle != CPU_NEWLY_IDLE)
- if (env.src_grp_nr_running > 1)
+ if (idle != CPU_NEWLY_IDLE &&
+ !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
+ if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
+ }
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -9191,7 +10628,8 @@ more_balance:
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
- if (!busiest->active_balance) {
+ if (!busiest->active_balance &&
+ !cpu_isolated(cpu_of(busiest))) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
@@ -9202,17 +10640,31 @@ more_balance:
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ *continue_balancing = 0;
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ sd->nr_balance_failed =
+ sd->cache_nice_tries +
+ NEED_ACTIVE_BALANCE_THRESHOLD - 1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ /* Assumes one 'busiest' cpu that we pulled tasks from */
+ if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
+ } else {
+ check_for_freq_change(this_rq, true, false);
+ }
+ }
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
@@ -9260,6 +10712,11 @@ out_one_pinned:
ld_moved = 0;
out:
+ trace_sched_load_balance(this_cpu, idle, *continue_balancing,
+ group ? group->cpumask[0] : 0,
+ busiest ? busiest->nr_running : 0,
+ env.imbalance, env.flags, ld_moved,
+ sd->balance_interval);
return ld_moved;
}
@@ -9302,6 +10759,9 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
+ if (cpu_isolated(this_cpu))
+ return 0;
+
idle_enter_fair(this_rq);
/*
@@ -9356,9 +10816,12 @@ static int idle_balance(struct rq *this_rq)
/*
* Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * now runnable tasks on the balance rq or if
+ * continue_balancing has been unset (only possible
+ * due to active migration).
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ !continue_balancing)
break;
}
rcu_read_unlock();
@@ -9410,13 +10873,19 @@ static int active_load_balance_cpu_stop(void *data)
struct task_struct *push_task = NULL;
int push_task_detached = 0;
struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .flags = 0,
+ .loop = 0,
+ .boost_policy = sched_boost_policy(),
};
+ bool moved = false;
raw_spin_lock_irq(&busiest_rq->lock);
@@ -9437,12 +10906,15 @@ static int active_load_balance_cpu_stop(void *data)
BUG_ON(busiest_rq == target_rq);
push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
if (push_task) {
if (task_on_rq_queued(push_task) &&
+ push_task->state == TASK_RUNNING &&
task_cpu(push_task) == busiest_cpu &&
cpu_online(target_cpu)) {
detach_task(push_task, &env);
push_task_detached = 1;
+ moved = true;
}
goto out_unlock;
}
@@ -9461,14 +10933,18 @@ static int active_load_balance_cpu_stop(void *data)
update_rq_clock(busiest_rq);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ moved = true;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
if (push_task)
busiest_rq->push_task = NULL;
@@ -9479,6 +10955,7 @@ out_unlock:
if (push_task_detached)
attach_one_task(target_rq, push_task);
put_task_struct(push_task);
+ clear_reserved(target_cpu);
}
if (p)
@@ -9486,6 +10963,15 @@ out_unlock:
local_irq_enable();
+ if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
+ } else if (moved) {
+ check_for_freq_change(target_rq, true, false);
+ }
+
return 0;
}
@@ -9501,9 +10987,49 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static inline int find_new_ilb(void)
+
+#ifdef CONFIG_SCHED_HMP
+static inline int find_new_hmp_ilb(int type)
+{
+ int call_cpu = raw_smp_processor_id();
+ struct sched_domain *sd;
+ int ilb;
+
+ rcu_read_lock();
+
+ /* Pick an idle cpu "closest" to call_cpu */
+ for_each_domain(call_cpu, sd) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ sched_domain_span(sd)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
+ rcu_read_unlock();
+ reset_balance_interval(ilb);
+ return ilb;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+ return nr_cpu_ids;
+}
+#else /* CONFIG_SCHED_HMP */
+static inline int find_new_hmp_ilb(int type)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
+static inline int find_new_ilb(int type)
+{
+ int ilb;
+
+#ifdef CONFIG_SCHED_HMP
+ return find_new_hmp_ilb(type);
+#endif
+
+ ilb = cpumask_first(nohz.idle_cpus_mask);
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -9516,13 +11042,13 @@ static inline int find_new_ilb(void)
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -9539,16 +11065,21 @@ static void nohz_balancer_kick(void)
return;
}
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
+}
+
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
* Completely isolated CPUs don't ever set, so we must test.
*/
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ nohz_balance_clear_nohz_mask(cpu);
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -9605,7 +11136,7 @@ void nohz_balance_enter_idle(int cpu)
/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@ -9634,7 +11165,13 @@ static DEFINE_SPINLOCK(balancing);
*/
void update_max_interval(void)
{
- max_load_balance_interval = HZ*num_online_cpus()/10;
+ cpumask_t avail_mask;
+ unsigned int available_cpus;
+
+ cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+ available_cpus = cpumask_weight(&avail_mask);
+
+ max_load_balance_interval = HZ*available_cpus/10;
}
/*
@@ -9759,12 +11296,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ cpumask_t cpus;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+ for_each_cpu(balance_cpu, &cpus) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
@@ -9807,6 +11347,79 @@ end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill ||
+ sched_boost_policy() == SCHED_BOOST_ON_ALL)
+ return 1;
+
+ if (cpu_max_power_cost(cpu) == max_power_cost)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+			/*
+			 * Change the kick type to restrict the kick to CPUs
+			 * of equal or lower capacity.
+			 */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ unsigned long now = jiffies;
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+#endif
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
+
+	/* Do idle load balance if there is a misfit task */
+ if (energy_aware())
+ return rq->misfit_task;
+
+ return (rq->nr_running >= 2);
+}
+
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
@@ -9818,12 +11431,14 @@ end:
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
- unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy;
+#endif
+ int cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
@@ -9836,24 +11451,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
-
- if (time_before(now, nohz.next_balance))
- return false;
-
- if (rq->nr_running >= 2 &&
- (!energy_aware() || cpu_overutilized(cpu)))
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
- /* Do idle load balance if there have misfit task */
- if (energy_aware())
- return rq->misfit_task;
-
+#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
@@ -9885,6 +11486,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
unlock:
rcu_read_unlock();
+#endif
return kick;
}
#else
@@ -9918,15 +11520,19 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ int type = NOHZ_KICK_ANY;
+
+	/*
+	 * No need to rebalance while attached to a NULL domain or while
+	 * the cpu is isolated.
+	 */
+ if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
@@ -9945,47 +11551,6 @@ static void rq_offline_fair(struct rq *rq)
unthrottle_offline_cfs_rqs(rq);
}
-static inline int
-kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
-{
- int rc = 0;
-
- /* Invoke active balance to force migrate currently running task */
- raw_spin_lock(&rq->lock);
- if (!rq->active_balance) {
- rq->active_balance = 1;
- rq->push_cpu = new_cpu;
- get_task_struct(p);
- rq->push_task = p;
- rc = 1;
- }
- raw_spin_unlock(&rq->lock);
-
- return rc;
-}
-
-void check_for_migration(struct rq *rq, struct task_struct *p)
-{
- int new_cpu;
- int active_balance;
- int cpu = task_cpu(p);
-
- if (energy_aware() && rq->misfit_task) {
- if (rq->curr->state != TASK_RUNNING ||
- rq->curr->nr_cpus_allowed == 1)
- return;
-
- new_cpu = select_energy_cpu_brute(p, cpu, 0);
- if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
- active_balance = kick_active_balance(rq, p, new_cpu);
- if (active_balance)
- stop_one_cpu_nowait(cpu,
- active_load_balance_cpu_stop,
- rq, &rq->active_balance_work);
- }
- }
-}
-
#endif /* CONFIG_SMP */
/*
@@ -10005,7 +11570,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
#ifdef CONFIG_SMP
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ if (energy_aware() &&
+ !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
@@ -10505,6 +12071,11 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 55e461055332..c30c48fde7e6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
-SCHED_FEAT(TTWU_QUEUE, true)
+SCHED_FEAT(TTWU_QUEUE, false)
#ifdef HAVE_RT_PUSH_IPI
/*
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
new file mode 100644
index 000000000000..ddcf7cfb7248
--- /dev/null
+++ b/kernel/sched/hmp.c
@@ -0,0 +1,4407 @@
+/* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Implementation credits: Srivatsa Vaddagiri, Steve Muckle
+ * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti
+ * and Vikram Mulukutla
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/list_sort.h>
+#include <linux/syscore_ops.h>
+
+#include "sched.h"
+
+#include <trace/events/sched.h>
+
+#define CSTATE_LATENCY_GRANULARITY_SHIFT (6)
+
+const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
+ "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
+ "IRQ_UPDATE"};
+
+const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"};
+
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
+
+static bool use_cycle_counter;
+static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
+
+u64 sched_ktime_clock(void)
+{
+ if (unlikely(sched_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void sched_resume(void)
+{
+ sched_ktime_suspended = false;
+}
+
+static int sched_suspend(void)
+{
+ ktime_last = ktime_get();
+ sched_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops sched_syscore_ops = {
+ .resume = sched_resume,
+ .suspend = sched_suspend
+};
+
+static int __init sched_init_ops(void)
+{
+ register_syscore_ops(&sched_syscore_ops);
+ return 0;
+}
+late_initcall(sched_init_ops);
+
+inline void clear_ed_task(struct task_struct *p, struct rq *rq)
+{
+ if (p == rq->ed_task)
+ rq->ed_task = NULL;
+}
+
+inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
+{
+ p->last_switch_out_ts = wallclock;
+}
+
+/*
+ * Note C-state for (idle) cpus.
+ *
+ * @cstate = cstate index, 0 -> active state
+ * @wakeup_energy = energy spent in waking up cpu
+ * @wakeup_latency = latency to wakeup from cstate
+ *
+ */
+void
+sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->cstate = cstate; /* C1, C2 etc */
+ rq->wakeup_energy = wakeup_energy;
+ /* disregard small latency delta (64 us). */
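+	/* e.g. a wakeup_latency of 1000 would be recorded as 960. */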
+ rq->wakeup_latency = ((wakeup_latency >>
+ CSTATE_LATENCY_GRANULARITY_SHIFT) <<
+ CSTATE_LATENCY_GRANULARITY_SHIFT);
+}
+
+/*
+ * Note D-state for (idle) cluster.
+ *
+ * @dstate = dstate index, 0 -> active state
+ * @wakeup_energy = energy spent in waking up cluster
+ * @wakeup_latency = latency to wakeup from cluster
+ *
+ */
+void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
+ int wakeup_energy, int wakeup_latency)
+{
+ struct sched_cluster *cluster =
+ cpu_rq(cpumask_first(cluster_cpus))->cluster;
+ cluster->dstate = dstate;
+ cluster->dstate_wakeup_energy = wakeup_energy;
+ cluster->dstate_wakeup_latency = wakeup_latency;
+}
+
+u32 __weak get_freq_max_load(int cpu, u32 freq)
+{
+ /* 100% by default */
+ return 100;
+}
+
+struct freq_max_load_entry {
+ /* The maximum load which has accounted governor's headroom. */
+ u64 hdemand;
+};
+
+struct freq_max_load {
+ struct rcu_head rcu;
+ int length;
+ struct freq_max_load_entry freqs[0];
+};
+
+static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
+static DEFINE_SPINLOCK(freq_max_load_lock);
+
+struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
+{
+ return NULL;
+}
+
+int sched_update_freq_max_load(const cpumask_t *cpumask)
+{
+ int i, cpu, ret;
+ unsigned int freq;
+ struct cpu_pstate_pwr *costs;
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+ struct freq_max_load *max_load, *old_max_load;
+ struct freq_max_load_entry *entry;
+ u64 max_demand_capacity, max_demand;
+ unsigned long flags;
+ u32 hfreq;
+ int hpct;
+
+ if (!per_cpu_info)
+ return 0;
+
+ spin_lock_irqsave(&freq_max_load_lock, flags);
+ max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
+ for_each_cpu(cpu, cpumask) {
+ if (!per_cpu_info[cpu].ptable) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+
+ /*
+		 * Allocate len + 1 entries and leave the last power cost as 0
+		 * so that power_cost() can stop iterating when
+		 * per_cpu_info[cpu].len > len of max_load due to a race between
+		 * the cpu power stats update and get_cpu_pwr_stats().
+ */
+ max_load = kzalloc(sizeof(struct freq_max_load) +
+ sizeof(struct freq_max_load_entry) *
+ (per_cpu_info[cpu].len + 1), GFP_ATOMIC);
+ if (unlikely(!max_load)) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ max_load->length = per_cpu_info[cpu].len;
+
+ max_demand = max_demand_capacity *
+ cpu_max_possible_capacity(cpu);
+
+ i = 0;
+ costs = per_cpu_info[cpu].ptable;
+ while (costs[i].freq) {
+ entry = &max_load->freqs[i];
+ freq = costs[i].freq;
+ hpct = get_freq_max_load(cpu, freq);
+ if (hpct <= 0 || hpct > 100)
+ hpct = 100;
+ hfreq = div64_u64((u64)freq * hpct, 100);
+ entry->hdemand =
+ div64_u64(max_demand * hfreq,
+ cpu_max_possible_freq(cpu));
+ i++;
+ }
+
+ rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
+ if (old_max_load)
+ kfree_rcu(old_max_load, rcu);
+ }
+
+ spin_unlock_irqrestore(&freq_max_load_lock, flags);
+ return 0;
+
+fail:
+ for_each_cpu(cpu, cpumask) {
+ max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+ if (max_load) {
+ rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
+ kfree_rcu(max_load, rcu);
+ }
+ }
+
+ spin_unlock_irqrestore(&freq_max_load_lock, flags);
+ return ret;
+}
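+
+/*
+ * Worked example with made-up numbers (nothing here comes from a real
+ * target): assume max_task_load() = 10000000, max_possible_capacity = 1024,
+ * and a cpu with cpu_max_possible_capacity() = 1024 and
+ * cpu_max_possible_freq() = 2000000. For a table entry with freq = 1500000
+ * and a governor headroom of hpct = 90:
+ *
+ *   max_demand_capacity = 10000000 / 1024             = 9765
+ *   max_demand          = 9765 * 1024                 = 9999360
+ *   hfreq               = 1500000 * 90 / 100          = 1350000
+ *   hdemand             = 9999360 * 1350000 / 2000000 = 6749568
+ *
+ * i.e. the busy time beyond which this frequency can no longer absorb the
+ * load once the governor's headroom is taken into account.
+ */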
+
+unsigned int max_possible_efficiency = 1;
+unsigned int min_possible_efficiency = UINT_MAX;
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+ int i;
+ int max_cap = 0, min_cap = INT_MAX;
+
+ for_each_online_cpu(i) {
+ max_cap = max(max_cap, cpu_capacity(i));
+ min_cap = min(min_cap, cpu_capacity(i));
+ }
+
+ max_capacity = max_cap;
+ min_capacity = min_cap;
+}
+
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long
+capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return (1024 * cluster->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return (1024 * cluster_max_freq(cluster)) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static inline unsigned long
+load_scale_cpu_efficiency(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cluster->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq,
+ cluster_max_freq(cluster));
+}
+
+static int compute_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cluster);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_max_possible_capacity(struct sched_cluster *cluster)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cluster);
+ capacity >>= 10;
+
+ capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_load_scale_factor(struct sched_cluster *cluster)
+{
+ int load_scale = 1024;
+
+ /*
+	 * load_scale_factor accounts for the fact that task load
+	 * is in reference to the "best" performing cpu. A task's load needs to
+	 * be scaled (up) by this factor to determine its suitability for
+	 * placement on a (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cluster);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cluster);
+ load_scale >>= 10;
+
+ return load_scale;
+}
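+
+/*
+ * Illustration with hypothetical numbers: for a little cluster with
+ * efficiency = 1024 and cluster_max_freq() = 1500000 on a system where
+ * max_possible_efficiency = 2048 and max_possible_freq = 2500000:
+ *
+ *   load_scale_cpu_efficiency = DIV_ROUND_UP(1024 * 2048, 1024)       = 2048
+ *   load_scale_cpu_freq       = DIV_ROUND_UP(1024 * 2500000, 1500000) = 1707
+ *   load_scale_factor         = (((1024 * 2048) >> 10) * 1707) >> 10  = 3414
+ *
+ * so task demand is scaled up by roughly 3.3x when judging whether a task
+ * fits on this cluster.
+ */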
+
+struct list_head cluster_head;
+static DEFINE_MUTEX(cluster_lock);
+static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
+DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
+struct sched_cluster *sched_cluster[NR_CPUS];
+int num_clusters;
+
+unsigned int max_power_cost = 1;
+
+struct sched_cluster init_cluster = {
+ .list = LIST_HEAD_INIT(init_cluster.list),
+ .id = 0,
+ .max_power_cost = 1,
+ .min_power_cost = 1,
+ .capacity = 1024,
+ .max_possible_capacity = 1024,
+ .efficiency = 1,
+ .load_scale_factor = 1024,
+ .cur_freq = 1,
+ .max_freq = 1,
+ .max_mitigated_freq = UINT_MAX,
+ .min_freq = 1,
+ .max_possible_freq = 1,
+ .dstate = 0,
+ .dstate_wakeup_energy = 0,
+ .dstate_wakeup_latency = 0,
+ .exec_scale_factor = 1024,
+ .notifier_sent = 0,
+ .wake_up_idle = 0,
+};
+
+static void update_all_clusters_stats(void)
+{
+ struct sched_cluster *cluster;
+ u64 highest_mpc = 0, lowest_mpc = U64_MAX;
+
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_sched_cluster(cluster) {
+ u64 mpc;
+
+ cluster->capacity = compute_capacity(cluster);
+ mpc = cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+ cluster->exec_scale_factor =
+ DIV_ROUND_UP(cluster->efficiency * 1024,
+ max_possible_efficiency);
+
+ if (mpc > highest_mpc)
+ highest_mpc = mpc;
+
+ if (mpc < lowest_mpc)
+ lowest_mpc = mpc;
+ }
+
+ max_possible_capacity = highest_mpc;
+ min_max_possible_capacity = lowest_mpc;
+
+ __update_min_max_capacity();
+ sched_update_freq_max_load(cpu_possible_mask);
+ post_big_task_count_change(cpu_possible_mask);
+}
+
+static void assign_cluster_ids(struct list_head *head)
+{
+ struct sched_cluster *cluster;
+ int pos = 0;
+
+ list_for_each_entry(cluster, head, list) {
+ cluster->id = pos;
+ sched_cluster[pos++] = cluster;
+ }
+}
+
+static void
+move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
+{
+ struct list_head *first, *last;
+
+ first = src->next;
+ last = src->prev;
+
+ if (sync_rcu) {
+ INIT_LIST_HEAD_RCU(src);
+ synchronize_rcu();
+ }
+
+ first->prev = dst;
+ dst->prev = last;
+ last->next = dst;
+
+ /* Ensure list sanity before making the head visible to all CPUs. */
+ smp_mb();
+ dst->next = first;
+}
+
+static int
+compare_clusters(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct sched_cluster *cluster1, *cluster2;
+ int ret;
+
+ cluster1 = container_of(a, struct sched_cluster, list);
+ cluster2 = container_of(b, struct sched_cluster, list);
+
+ /*
+ * Don't assume higher capacity means higher power. If the
+	 * power cost is the same, sort the higher capacity cluster before
+ * the lower capacity cluster to start placing the tasks
+ * on the higher capacity cluster.
+ */
+ ret = cluster1->max_power_cost > cluster2->max_power_cost ||
+ (cluster1->max_power_cost == cluster2->max_power_cost &&
+ cluster1->max_possible_capacity <
+ cluster2->max_possible_capacity);
+
+ return ret;
+}
+
+static void sort_clusters(void)
+{
+ struct sched_cluster *cluster;
+ struct list_head new_head;
+ unsigned int tmp_max = 1;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_sched_cluster(cluster) {
+ cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
+ max_task_load());
+ cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
+ 0);
+
+ if (cluster->max_power_cost > tmp_max)
+ tmp_max = cluster->max_power_cost;
+ }
+ max_power_cost = tmp_max;
+
+ move_list(&new_head, &cluster_head, true);
+
+ list_sort(NULL, &new_head, compare_clusters);
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+}
+
+static void
+insert_cluster(struct sched_cluster *cluster, struct list_head *head)
+{
+ struct sched_cluster *tmp;
+ struct list_head *iter = head;
+
+ list_for_each_entry(tmp, head, list) {
+ if (cluster->max_power_cost < tmp->max_power_cost)
+ break;
+ iter = &tmp->list;
+ }
+
+ list_add(&cluster->list, iter);
+}
+
+static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
+{
+ struct sched_cluster *cluster = NULL;
+
+ cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
+ if (!cluster) {
+		__WARN_printf("Cluster allocation failed. Possible bad scheduling\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&cluster->list);
+ cluster->max_power_cost = 1;
+ cluster->min_power_cost = 1;
+ cluster->capacity = 1024;
+ cluster->max_possible_capacity = 1024;
+ cluster->efficiency = 1;
+ cluster->load_scale_factor = 1024;
+ cluster->cur_freq = 1;
+ cluster->max_freq = 1;
+ cluster->max_mitigated_freq = UINT_MAX;
+ cluster->min_freq = 1;
+ cluster->max_possible_freq = 1;
+ cluster->dstate = 0;
+ cluster->dstate_wakeup_energy = 0;
+ cluster->dstate_wakeup_latency = 0;
+ cluster->freq_init_done = false;
+
+ raw_spin_lock_init(&cluster->load_lock);
+ cluster->cpus = *cpus;
+ cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
+
+ if (cluster->efficiency > max_possible_efficiency)
+ max_possible_efficiency = cluster->efficiency;
+ if (cluster->efficiency < min_possible_efficiency)
+ min_possible_efficiency = cluster->efficiency;
+
+ cluster->notifier_sent = 0;
+ return cluster;
+}
+
+static void add_cluster(const struct cpumask *cpus, struct list_head *head)
+{
+ struct sched_cluster *cluster = alloc_new_cluster(cpus);
+ int i;
+
+ if (!cluster)
+ return;
+
+ for_each_cpu(i, cpus)
+ cpu_rq(i)->cluster = cluster;
+
+ insert_cluster(cluster, head);
+ set_bit(num_clusters, all_cluster_ids);
+ num_clusters++;
+}
+
+void update_cluster_topology(void)
+{
+ struct cpumask cpus = *cpu_possible_mask;
+ const struct cpumask *cluster_cpus;
+ struct list_head new_head;
+ int i;
+
+ INIT_LIST_HEAD(&new_head);
+
+ for_each_cpu(i, &cpus) {
+ cluster_cpus = cpu_coregroup_mask(i);
+ cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
+ cpumask_andnot(&cpus, &cpus, cluster_cpus);
+ add_cluster(cluster_cpus, &new_head);
+ }
+
+ assign_cluster_ids(&new_head);
+
+ /*
+ * Ensure cluster ids are visible to all CPUs before making
+ * cluster_head visible.
+ */
+ move_list(&cluster_head, &new_head, false);
+ update_all_clusters_stats();
+}
+
+void init_clusters(void)
+{
+ bitmap_clear(all_cluster_ids, 0, NR_CPUS);
+ init_cluster.cpus = *cpu_possible_mask;
+ raw_spin_lock_init(&init_cluster.load_lock);
+ INIT_LIST_HEAD(&cluster_head);
+}
+
+int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
+{
+ mutex_lock(&cluster_lock);
+ if (!cb->get_cpu_cycle_counter) {
+ mutex_unlock(&cluster_lock);
+ return -EINVAL;
+ }
+
+ cpu_cycle_counter_cb = *cb;
+ use_cycle_counter = true;
+ mutex_unlock(&cluster_lock);
+
+ return 0;
+}
+
+/* Clear any HMP scheduler related requests pending from or on cpu */
+void clear_hmp_request(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ clear_boost_kick(cpu);
+ clear_reserved(cpu);
+ if (rq->push_task) {
+ struct task_struct *push_task = NULL;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->push_task) {
+ clear_reserved(rq->push_cpu);
+ push_task = rq->push_task;
+ rq->push_task = NULL;
+ }
+ rq->active_balance = 0;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ if (push_task)
+ put_task_struct(push_task);
+ }
+}
+
+int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->static_cpu_pwr_cost = cost;
+ return 0;
+}
+
+unsigned int sched_get_static_cpu_pwr_cost(int cpu)
+{
+ return cpu_rq(cpu)->static_cpu_pwr_cost;
+}
+
+int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+ cluster->static_cluster_pwr_cost = cost;
+ return 0;
+}
+
+unsigned int sched_get_static_cluster_pwr_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
+}
+
+int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+
+ cluster->wake_up_idle = !!wake_idle;
+ return 0;
+}
+
+unsigned int sched_get_cluster_wake_idle(int cpu)
+{
+ return cpu_rq(cpu)->cluster->wake_up_idle;
+}
+
+/*
+ * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
+ * associated with them. This is required for atomic updates of those variables
+ * when they are modified via the sysctl interface.
+ *
+ * IMPORTANT: Initialize both copies to the same value!!
+ */
+
+/*
+ * Tasks that are runnable continuously for a period greater than
+ * EARLY_DETECTION_DURATION can be flagged early as potential
+ * high load tasks.
+ */
+#define EARLY_DETECTION_DURATION 9500000
+
+static __read_mostly unsigned int sched_ravg_hist_size = 5;
+__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
+
+static __read_mostly unsigned int sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+__read_mostly unsigned int sysctl_sched_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+
+#define SCHED_ACCOUNT_WAIT_TIME 1
+
+__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
+
+/*
+ * Enable colocation and frequency aggregation for all threads in a process.
+ * Children inherit the group id from the parent.
+ */
+unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
+
+#define SCHED_NEW_TASK_WINDOWS 5
+
+#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
+
+/*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
+
+/*
+ * For increase, send notification if
+ * freq_required - cur_freq > sysctl_sched_freq_inc_notify
+ */
+__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
+
+/*
+ * For decrease, send notification if
+ * cur_freq - freq_required > sysctl_sched_freq_dec_notify
+ */
+__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
+
+static __read_mostly unsigned int sched_io_is_busy;
+
+__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be the same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+unsigned int min_max_freq = 1;
+
+unsigned int max_capacity = 1024; /* max(rq->capacity) */
+unsigned int min_capacity = 1024; /* min(rq->capacity) */
+unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+unsigned int
+min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
+/* Maximum allowed threshold before freq aggregation must be enabled */
+#define MAX_FREQ_AGGR_THRESH 1000
+
+/* Temporarily disable window-stats activity on all cpus */
+unsigned int __read_mostly sched_disable_window_stats;
+
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+#define for_each_related_thread_group(grp) \
+ list_for_each_entry(grp, &active_related_thread_groups, list)
+
+/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+ MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
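+
+/*
+ * For instance, if NUM_LOAD_INDICES were 1000 (the actual value is defined
+ * elsewhere), the default 10ms window would give a granule of
+ * 10000000 / 1000 = 10000ns, i.e. top-task load tracked at 10us resolution.
+ */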
+
+/* Size of bitmaps maintained to track top tasks */
+static const unsigned int top_tasks_bitmap_size =
+ BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
+
+/*
+ * Demand aggregation for frequency purpose:
+ *
+ * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
+ * for frequency determination purpose. This aggregation is done per-cluster.
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just rq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ * Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ * One related thread group A that has tasks A0, A1, A2
+ *
+ * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ * tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ * not belonging to group A are accumulated when they run on cpu X
+ *
+ * Let's say the stats for window M were as below:
+ *
+ * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ * Task A0 ran 5ms on CPU0
+ * Task B0 ran 1ms on CPU0
+ *
+ * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ * Task A1 ran 4ms on CPU1
+ * Task A2 ran 2ms on CPU1
+ * Task B1 ran 5ms on CPU1
+ *
+ * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ * CPU2 idle
+ *
+ * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ * CPU3 idle
+ *
+ * In this case, CPU1 was the most busy going by just its prev_sum counter.
+ * Demand from all group A tasks is added to CPU1. IOW, at the end of
+ * window M, the cpu busy time reported to the governor will be:
+ *
+ * C0 busy time = 1ms
+ * C1 busy time = 5 + 5 + 6 = 16ms
+ * (C1->prev_sum + A->cpu_time[0].prev_sum + A->cpu_time[1].prev_sum)
+ *
+ */
+static __read_mostly unsigned int sched_freq_aggregate = 1;
+__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
+
+unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
+static unsigned int __read_mostly sched_freq_aggregate_threshold;
+
+/* Initial task load. Newly created tasks are assigned this load. */
+unsigned int __read_mostly sched_init_task_load_windows;
+unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
+
+unsigned int max_task_load(void)
+{
+ return sched_ravg_window;
+}
+
+/*
+ * A cpu cannot accommodate any more tasks if:
+ *
+ * rq->nr_running > sysctl_sched_spill_nr_run ||
+ * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
+ */
+unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
+
+/*
+ * Place sync wakee tasks that have less than the configured demand on the
+ * waker's cluster.
+ */
+unsigned int __read_mostly sched_small_wakee_task_load;
+unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
+
+unsigned int __read_mostly sched_big_waker_task_load;
+unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
+
+/*
+ * CPUs with load greater than the sched_spill_load_threshold are not
+ * eligible for task placement. When all CPUs in a cluster achieve a
+ * load higher than this level, tasks become eligible for inter-cluster
+ * migration.
+ */
+unsigned int __read_mostly sched_spill_load;
+unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
+
+/*
+ * Prefer the waker CPU for a sync wakee task if the CPU has only 1 runnable
+ * task. This eliminates the LPM exit latency associated with the idle
+ * CPUs in the waker cluster.
+ */
+unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
+
+/*
+ * Tasks whose bandwidth consumption on a cpu is more than
+ * sched_upmigrate are considered "big" tasks. Big tasks will be
+ * considered for "up" migration, i.e migrating to a cpu with better
+ * capacity.
+ */
+unsigned int __read_mostly sched_upmigrate;
+unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
+
+/*
+ * Big tasks, once migrated, will need to drop their bandwidth
+ * consumption to less than sched_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_downmigrate;
+unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
+
+/*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
+
+/*
+ * The load scale factor of a CPU gets boosted when its max frequency
+ * is restricted, which causes tasks to migrate to higher capacity
+ * CPUs early. The sched_upmigrate threshold is scaled up by
+ * rq->max_possible_freq/rq->max_freq of a lower capacity CPU.
+ */
+unsigned int up_down_migrate_scale_factor = 1024;
+
+/*
+ * The scheduler selects and places a task on its previous CPU if its sleep
+ * time is less than sysctl_sched_select_prev_cpu_us.
+ */
+unsigned int __read_mostly
+sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
+
+unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
+
+unsigned int __read_mostly
+sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
+
+unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
+
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds and it sleeps on an average
+ * for more than sysctl_sched_short_sleep nanoseconds, then the
+ * task is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC;
+
+static void _update_up_down_migrate(unsigned int *up_migrate,
+ unsigned int *down_migrate, bool is_group)
+{
+ unsigned int delta;
+
+ if (up_down_migrate_scale_factor == 1024)
+ return;
+
+ delta = *up_migrate - *down_migrate;
+
+ *up_migrate /= NSEC_PER_USEC;
+ *up_migrate *= up_down_migrate_scale_factor;
+ *up_migrate >>= 10;
+ *up_migrate *= NSEC_PER_USEC;
+
+ if (!is_group)
+ *up_migrate = min(*up_migrate, sched_ravg_window);
+
+ *down_migrate /= NSEC_PER_USEC;
+ *down_migrate *= up_down_migrate_scale_factor;
+ *down_migrate >>= 10;
+ *down_migrate *= NSEC_PER_USEC;
+
+ *down_migrate = min(*down_migrate, *up_migrate - delta);
+}
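+
+/*
+ * Illustrative example of the scaling above (assuming a 20ms
+ * sched_ravg_window and up_down_migrate_scale_factor == 2048):
+ * upmigrate 80% -> 16ms and downmigrate 60% -> 12ms, i.e. delta = 4ms.
+ * Scaling doubles both to 32ms/24ms; up_migrate is then capped to the
+ * 20ms window and down_migrate to min(24ms, 20ms - 4ms) = 16ms, which
+ * preserves the original 4ms hysteresis gap.
+ */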
+
+static void update_up_down_migrate(void)
+{
+ unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
+ unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
+
+ _update_up_down_migrate(&up_migrate, &down_migrate, false);
+ sched_upmigrate = up_migrate;
+ sched_downmigrate = down_migrate;
+
+ up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct);
+ down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct);
+
+ _update_up_down_migrate(&up_migrate, &down_migrate, true);
+ sched_group_upmigrate = up_migrate;
+ sched_group_downmigrate = down_migrate;
+}
+
+void set_hmp_defaults(void)
+{
+ sched_spill_load =
+ pct_to_real(sysctl_sched_spill_load_pct);
+
+ update_up_down_migrate();
+
+ sched_init_task_load_windows =
+ div64_u64((u64)sysctl_sched_init_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
+ NSEC_PER_USEC;
+
+ sched_small_wakee_task_load =
+ div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_big_waker_task_load =
+ div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ sched_freq_aggregate_threshold =
+ pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+}
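+
+/*
+ * A minimal sketch of what the conversions above produce, assuming
+ * pct_to_real() scales a percentage to absolute units of
+ * sched_ravg_window: with a 20ms window, sysctl_sched_spill_load_pct
+ * of 100 makes sched_spill_load a full 20ms of busy time, and
+ * sysctl_sched_init_task_load_pct of 15 gives new tasks an initial
+ * demand of 3ms worth of window time.
+ */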
+
+u32 sched_get_init_task_load(struct task_struct *p)
+{
+ return p->init_load_pct;
+}
+
+int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
+{
+ if (init_load_pct < 0 || init_load_pct > 100)
+ return -EINVAL;
+
+ p->init_load_pct = init_load_pct;
+
+ return 0;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+int upmigrate_discouraged(struct task_struct *p)
+{
+ return task_group(p)->upmigrate_discouraged;
+}
+
+#else
+
+static inline int upmigrate_discouraged(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif
+
+/* Is a task "big" on its current cpu */
+static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
+{
+ int nice = task_nice(p);
+
+ if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p))
+ return 0;
+
+ return scaled_load > sched_upmigrate;
+}
+
+int is_big_task(struct task_struct *p)
+{
+ return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
+}
+
+u64 cpu_load(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu);
+}
+
+u64 cpu_load_sync(int cpu, int sync)
+{
+ return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
+}
+
+/*
+ * A task will fit on a cpu if its bandwidth consumption on that cpu
+ * will be less than sched_upmigrate. A big task that was previously
+ * "up" migrated will be considered fitting on "little" cpu if its
+ * bandwidth consumption on "little" cpu will be less than
+ * sched_downmigrate. This helps avoid frequent migrations for
+ * tasks with load close to the upmigrate threshold.
+ */
+int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
+ enum sched_boost_policy boost_policy)
+{
+ int upmigrate = sched_upmigrate;
+
+ if (cpu_capacity(cpu) == max_capacity)
+ return 1;
+
+ if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
+ upmigrate = sched_downmigrate;
+
+ if (boost_policy != SCHED_BOOST_ON_BIG) {
+ if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p))
+ return 1;
+
+ if (task_load < upmigrate)
+ return 1;
+ } else {
+ if (task_sched_boost(p) || task_load >= upmigrate)
+ return 0;
+
+ return 1;
+ }
+
+ return 0;
+}
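+
+/*
+ * Illustration of the hysteresis implemented above (non-boost case),
+ * using the default 80%/60% thresholds: a little-cpu task whose scaled
+ * load reaches the upmigrate level no longer fits there and becomes an
+ * up-migration candidate; once it runs on a max-capacity cpu, it is
+ * only considered to fit back on a little cpu when its scaled load on
+ * that cpu falls below the downmigrate level, so loads between the two
+ * thresholds do not ping-pong.
+ */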
+
+int task_will_fit(struct task_struct *p, int cpu)
+{
+ u64 tload = scale_load_to_cpu(task_load(p), cpu);
+
+ return task_load_will_fit(p, tload, cpu, sched_boost_policy());
+}
+
+static int
+group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
+ u64 demand, bool group_boost)
+{
+ int cpu = cluster_first_cpu(cluster);
+ int prev_capacity = 0;
+ unsigned int threshold = sched_group_upmigrate;
+ u64 load;
+
+ if (cluster->capacity == max_capacity)
+ return 1;
+
+ if (group_boost)
+ return 0;
+
+ if (!demand)
+ return 1;
+
+ if (grp->preferred_cluster)
+ prev_capacity = grp->preferred_cluster->capacity;
+
+ if (cluster->capacity < prev_capacity)
+ threshold = sched_group_downmigrate;
+
+ load = scale_load_to_cpu(demand, cpu);
+ if (load < threshold)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Return the cost of running task p on CPU cpu. This function
+ * currently assumes that task p is the only task which will run on
+ * the CPU.
+ */
+unsigned int power_cost(int cpu, u64 demand)
+{
+ int first, mid, last;
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+ struct cpu_pstate_pwr *costs;
+ struct freq_max_load *max_load;
+ int total_static_pwr_cost = 0;
+ struct rq *rq = cpu_rq(cpu);
+ unsigned int pc;
+
+ if (!per_cpu_info || !per_cpu_info[cpu].ptable)
+ /*
+ * When power aware scheduling is not in use, or CPU
+ * power data is not available, just use the CPU
+ * capacity as a rough stand-in for real CPU power
+ * numbers, assuming bigger CPUs are more power
+ * hungry.
+ */
+ return cpu_max_possible_capacity(cpu);
+
+ rcu_read_lock();
+ max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
+ if (!max_load) {
+ pc = cpu_max_possible_capacity(cpu);
+ goto unlock;
+ }
+
+ costs = per_cpu_info[cpu].ptable;
+
+ if (demand <= max_load->freqs[0].hdemand) {
+ pc = costs[0].power;
+ goto unlock;
+ } else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
+ pc = costs[max_load->length - 1].power;
+ goto unlock;
+ }
+
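+	/*
+	 * Binary search below: find the smallest hdemand entry that can
+	 * still accommodate @demand (assuming hdemand is the per-frequency
+	 * busy-time capacity); its index selects the power cost of the
+	 * lowest frequency able to serve this load.
+	 */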
+ first = 0;
+ last = max_load->length - 1;
+ mid = (last - first) >> 1;
+ while (1) {
+ if (demand <= max_load->freqs[mid].hdemand)
+ last = mid;
+ else
+ first = mid;
+
+ if (last - first == 1)
+ break;
+ mid = first + ((last - first) >> 1);
+ }
+
+ pc = costs[last].power;
+
+unlock:
+ rcu_read_unlock();
+
+ if (idle_cpu(cpu) && rq->cstate) {
+ total_static_pwr_cost += rq->static_cpu_pwr_cost;
+ if (rq->cluster->dstate)
+ total_static_pwr_cost +=
+ rq->cluster->static_cluster_pwr_cost;
+ }
+
+ return pc + total_static_pwr_cost;
+
+}
+
+void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+ if (sched_disable_window_stats)
+ return;
+
+ if (is_big_task(p))
+ stats->nr_big_tasks++;
+}
+
+void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
+{
+ if (sched_disable_window_stats)
+ return;
+
+ if (is_big_task(p))
+ stats->nr_big_tasks--;
+
+ BUG_ON(stats->nr_big_tasks < 0);
+}
+
+void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
+{
+ stats->nr_big_tasks = 0;
+ if (reset_cra) {
+ stats->cumulative_runnable_avg = 0;
+ stats->pred_demands_sum = 0;
+ }
+}
+
+int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ struct related_thread_group *grp;
+ int rc = 1;
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+ if (grp)
+ rc = (grp->preferred_cluster == cluster);
+
+ rcu_read_unlock();
+ return rc;
+}
+
+struct sched_cluster *rq_cluster(struct rq *rq)
+{
+ return rq->cluster;
+}
+
+/*
+ * reset_cpu_hmp_stats - reset HMP stats for a cpu
+ * nr_big_tasks
+ * cumulative_runnable_avg (iff reset_cra is true)
+ */
+void reset_cpu_hmp_stats(int cpu, int reset_cra)
+{
+ reset_cfs_rq_hmp_stats(cpu, reset_cra);
+ reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
+}
+
+void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 delta)
+{
+ u64 new_task_load;
+ u64 old_task_load;
+
+ if (sched_disable_window_stats)
+ return;
+
+ old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));
+ new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
+
+ if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))
+ stats->nr_big_tasks--;
+ else if (!__is_big_task(p, old_task_load) &&
+ __is_big_task(p, new_task_load))
+ stats->nr_big_tasks++;
+
+ BUG_ON(stats->nr_big_tasks < 0);
+}
+
+/*
+ * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
+ */
+static void update_nr_big_tasks(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+
+ /* Do not reset cumulative_runnable_avg */
+ reset_cpu_hmp_stats(cpu, 0);
+
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+ _inc_hmp_sched_stats_fair(rq, p, 0);
+}
+
+/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
+void pre_big_task_count_change(const struct cpumask *cpus)
+{
+ int i;
+
+ local_irq_disable();
+
+ for_each_cpu(i, cpus)
+ raw_spin_lock(&cpu_rq(i)->lock);
+}
+
+/*
+ * Reinitialize 'nr_big_tasks' counters on all affected cpus
+ */
+void post_big_task_count_change(const struct cpumask *cpus)
+{
+ int i;
+
+ /* Assumes local_irq_disable() keeps online cpumap stable */
+ for_each_cpu(i, cpus)
+ update_nr_big_tasks(i);
+
+ for_each_cpu(i, cpus)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+
+ local_irq_enable();
+}
+
+DEFINE_MUTEX(policy_mutex);
+
+unsigned int update_freq_aggregate_threshold(unsigned int threshold)
+{
+ unsigned int old_threshold;
+
+ mutex_lock(&policy_mutex);
+
+ old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
+
+ sysctl_sched_freq_aggregate_threshold_pct = threshold;
+ sched_freq_aggregate_threshold =
+ pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+
+ mutex_unlock(&policy_mutex);
+
+ return old_threshold;
+}
+
+static inline int invalid_value_freq_input(unsigned int *data)
+{
+ if (data == &sysctl_sched_freq_aggregate)
+ return !(*data == 0 || *data == 1);
+
+ return 0;
+}
+
+static inline int invalid_value(unsigned int *data)
+{
+ unsigned int val = *data;
+
+ if (data == &sysctl_sched_ravg_hist_size)
+ return (val < 2 || val > RAVG_HIST_SIZE_MAX);
+
+ if (data == &sysctl_sched_window_stats_policy)
+ return val >= WINDOW_STATS_INVALID_POLICY;
+
+ return invalid_value_freq_input(data);
+}
+
+/*
+ * Handle "atomic" update of sysctl_sched_window_stats_policy,
+ * sysctl_sched_ravg_hist_size variables.
+ */
+int sched_window_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int *data = (unsigned int *)table->data;
+ unsigned int old_val;
+
+ mutex_lock(&policy_mutex);
+
+ old_val = *data;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret || !write || (write && (old_val == *data)))
+ goto done;
+
+ if (invalid_value(data)) {
+ *data = old_val;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ reset_all_window_stats(0, 0);
+
+done:
+ mutex_unlock(&policy_mutex);
+
+ return ret;
+}
+
+/*
+ * Convert percentage values into absolute form. This avoids a div()
+ * operation in the fast path when comparing task load against the thresholds.
+ */
+int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned int old_val;
+ unsigned int *data = (unsigned int *)table->data;
+ int update_task_count = 0;
+
+ /*
+ * The policy mutex is acquired with cpu_hotplug.lock
+ * held from cpu_up()->cpufreq_governor_interactive()->
+ * sched_set_window(). So enforce the same order here.
+ */
+ if (write && (data == &sysctl_sched_upmigrate_pct)) {
+ update_task_count = 1;
+ get_online_cpus();
+ }
+
+ mutex_lock(&policy_mutex);
+
+ old_val = *data;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto done;
+
+ if (write && (old_val == *data))
+ goto done;
+
+ if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
+ sysctl_sched_group_downmigrate_pct >
+ sysctl_sched_group_upmigrate_pct) {
+ *data = old_val;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * A change to the big task tunables requires re-classifying tasks on the
+ * runqueue as big and setting their counters appropriately.
+ * The sysctl interface affects secondary variables (*_pct), which are then
+ * "atomically" carried over to the primary variables. The atomic change
+ * includes taking the runqueue lock of all online cpus and re-initializing
+ * their big task counters based on the changed criteria.
+ */
+ if (update_task_count)
+ pre_big_task_count_change(cpu_online_mask);
+
+ set_hmp_defaults();
+
+ if (update_task_count)
+ post_big_task_count_change(cpu_online_mask);
+
+done:
+ mutex_unlock(&policy_mutex);
+ if (update_task_count)
+ put_online_cpus();
+ return ret;
+}
+
+inline int nr_big_tasks(struct rq *rq)
+{
+ return rq->hmp_stats.nr_big_tasks;
+}
+
+unsigned int cpu_temp(int cpu)
+{
+ struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
+
+ if (per_cpu_info)
+ return per_cpu_info[cpu].temp;
+ else
+ return 0;
+}
+
+/*
+ * kfree() may wakeup kswapd. So this function should NOT be called
+ * with any CPU's rq->lock acquired.
+ */
+void free_task_load_ptrs(struct task_struct *p)
+{
+ kfree(p->ravg.curr_window_cpu);
+ kfree(p->ravg.prev_window_cpu);
+
+ /*
+ * update_task_ravg() can be called for exiting tasks. While the
+ * function itself ensures correct behavior, the corresponding
+ * trace event requires that these pointers be NULL.
+ */
+ p->ravg.curr_window_cpu = NULL;
+ p->ravg.prev_window_cpu = NULL;
+}
+
+void init_new_task_load(struct task_struct *p, bool idle_task)
+{
+ int i;
+ u32 init_load_windows = sched_init_task_load_windows;
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ rcu_assign_pointer(p->grp, NULL);
+ INIT_LIST_HEAD(&p->grp_list);
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ p->cpu_cycles = 0;
+ p->ravg.curr_burst = 0;
+ /*
+ * Initialize the avg_burst to twice the threshold, so that
+ * a task would not be classified as short burst right away
+ * after fork. It takes at least 6 sleep-wakeup cycles for
+ * the avg_burst to go below the threshold.
+ */
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+ p->ravg.avg_sleep_time = 0;
+
+ p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+ p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
+
+ /* Don't have much choice. CPU frequency would be bogus */
+ BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
+
+ if (idle_task)
+ return;
+
+ if (init_load_pct)
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)sched_ravg_window, 100);
+
+ p->ravg.demand = init_load_windows;
+ p->ravg.pred_demand = 0;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
+
+/* Return task demand in percentage scale */
+unsigned int pct_task_load(struct task_struct *p)
+{
+ unsigned int load;
+
+ load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
+
+ return load;
+}
+
+/*
+ * Return the total number of tasks "eligible" to run on the highest
+ * capacity cpu.
+ *
+ * This is simply nr_big_tasks for cpus which are not of max_capacity and
+ * nr_running for cpus of max_capacity.
+ */
+unsigned int nr_eligible_big_tasks(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ int nr_big = rq->hmp_stats.nr_big_tasks;
+ int nr = rq->nr_running;
+
+ if (!is_max_capacity_cpu(cpu))
+ return nr_big;
+
+ return nr;
+}
+
+static inline int exiting_task(struct task_struct *p)
+{
+ return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
+}
+
+static int __init set_sched_ravg_window(char *str)
+{
+ unsigned int window_size;
+
+ get_option(&str, &window_size);
+
+ if (window_size < MIN_SCHED_RAVG_WINDOW ||
+ window_size > MAX_SCHED_RAVG_WINDOW) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ sched_ravg_window = window_size;
+ return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
+
+static inline void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ BUG_ON(delta < 0);
+ if (delta < sched_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, sched_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
+}
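+
+/*
+ * For illustration (assuming a 20ms window): if 45ms have elapsed since
+ * rq->window_start, nr_windows is 2 and window_start advances by 40ms,
+ * leaving the remaining 5ms inside the newly-current window.
+ */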
+
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ u32 freq;
+
+ freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
+ delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
+ delta *= rq->cluster->exec_scale_factor;
+ delta >>= 10;
+
+ return delta;
+}
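+
+/*
+ * Rough example of the scaling above: with the cpu running at half of
+ * max_possible_freq and an exec_scale_factor of 1024 (no capacity
+ * correction), a 10ms raw delta is reported as roughly 5ms of
+ * max-frequency-equivalent execution time.
+ */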
+
+static inline int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!sched_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+/* Does freq_required sufficiently exceed or fall behind cur_freq? */
+static inline int
+nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
+{
+ int delta = freq_required - cur_freq;
+
+ if (freq_required > cur_freq)
+ return delta < sysctl_sched_freq_inc_notify;
+
+ delta = -delta;
+
+ return delta < sysctl_sched_freq_dec_notify;
+}
+
+/* Convert busy time to frequency equivalent */
+static inline unsigned int load_to_freq(struct rq *rq, u64 load)
+{
+ unsigned int freq;
+
+ load = scale_load_to_cpu(load, cpu_of(rq));
+ load *= 128;
+ load = div64_u64(load, max_task_load());
+
+ freq = load * cpu_max_possible_freq(cpu_of(rq));
+ freq /= 128;
+
+ return freq;
+}
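+
+/*
+ * Sketch of the conversion above: a load equal to max_task_load() (a
+ * fully busy window) maps to cpu_max_possible_freq(), while half a
+ * window of busy time maps to roughly half of that frequency.
+ */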
+
+/*
+ * Return load from all related groups in given frequency domain.
+ */
+static void group_load_in_freq_domain(struct cpumask *cpus,
+ u64 *grp_load, u64 *new_grp_load)
+{
+ int j;
+
+ for_each_cpu(j, cpus) {
+ struct rq *rq = cpu_rq(j);
+
+ *grp_load += rq->grp_time.prev_runnable_sum;
+ *new_grp_load += rq->grp_time.nt_prev_runnable_sum;
+ }
+}
+
+static inline u64 freq_policy_load(struct rq *rq, u64 load);
+/*
+ * Should scheduler alert governor for changing frequency?
+ *
+ * @check_pred - evaluate frequency based on the predictive demand
+ * @check_groups - add load from all related groups on given cpu
+ *
+ * check_groups is set to 1 if a "related" task movement/wakeup is triggering
+ * the notification check. To avoid "re-aggregation" of demand in such cases,
+ * we check whether the migrated/woken task's demand (along with demand from
+ * existing tasks on the cpu) can be met on the target cpu.
+ */
+
+static int send_notification(struct rq *rq, int check_pred, int check_groups)
+{
+ unsigned int cur_freq, freq_required;
+ unsigned long flags;
+ int rc = 0;
+ u64 group_load = 0, new_load = 0;
+
+ if (check_pred) {
+ u64 prev = rq->old_busy_time;
+ u64 predicted = rq->hmp_stats.pred_demands_sum;
+
+ if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
+ return 0;
+
+ prev = max(prev, rq->old_estimated_time);
+ if (prev > predicted)
+ return 0;
+
+ cur_freq = load_to_freq(rq, prev);
+ freq_required = load_to_freq(rq, predicted);
+
+ if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
+ return 0;
+ } else {
+ /*
+ * Protect from concurrent update of rq->prev_runnable_sum and
+ * group cpu load
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (check_groups)
+ group_load = rq->grp_time.prev_runnable_sum;
+
+ new_load = rq->prev_runnable_sum + group_load;
+ new_load = freq_policy_load(rq, new_load);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ cur_freq = load_to_freq(rq, rq->old_busy_time);
+ freq_required = load_to_freq(rq, new_load);
+
+ if (nearly_same_freq(cur_freq, freq_required))
+ return 0;
+ }
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->cluster->notifier_sent) {
+ rq->cluster->notifier_sent = 1;
+ rc = 1;
+ trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
+ new_load);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+/* Alert governor if there is a need to change frequency */
+void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
+{
+ int cpu = cpu_of(rq);
+
+ if (!send_notification(rq, check_pred, check_groups))
+ return;
+
+ atomic_notifier_call_chain(
+ &load_alert_notifier_head, 0,
+ (void *)(long)cpu);
+}
+
+void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
+ struct task_struct *p)
+{
+ bool check_groups;
+
+ rcu_read_lock();
+ check_groups = task_in_related_thread_group(p);
+ rcu_read_unlock();
+
+ if (!same_freq_domain(src_cpu, dest_cpu)) {
+ if (!src_cpu_dead)
+ check_for_freq_change(cpu_rq(src_cpu), false,
+ check_groups);
+ check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
+ } else {
+ check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
+ }
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
+ return 1;
+
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved between
+	 * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
+ }
+
+ /* TASK_MIGRATE, PICK_NEXT_TASK left */
+ return SCHED_FREQ_ACCOUNT_WAIT_TIME;
+}
+
+static inline bool is_new_task(struct task_struct *p)
+{
+ return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS;
+}
+
+#define INC_STEP 8
+#define DEC_STEP 2
+#define CONSISTENT_THRES 16
+#define INC_STEP_BIG 16
+/*
+ * bucket_increase - update the count of all buckets
+ *
+ * @buckets: array of buckets tracking busy time of a task
+ * @idx: the index of bucket to be incremented
+ *
+ * Each time a complete window finishes, the count of the bucket that the
+ * runtime falls in (@idx) is incremented. The counts of all other buckets
+ * are decayed. The rate of increase and decay could be different based
+ * on current count in the bucket.
+ */
+static inline void bucket_increase(u8 *buckets, int idx)
+{
+ int i, step;
+
+ for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
+ if (idx != i) {
+ if (buckets[i] > DEC_STEP)
+ buckets[i] -= DEC_STEP;
+ else
+ buckets[i] = 0;
+ } else {
+ step = buckets[i] >= CONSISTENT_THRES ?
+ INC_STEP_BIG : INC_STEP;
+ if (buckets[i] > U8_MAX - step)
+ buckets[i] = U8_MAX;
+ else
+ buckets[i] += step;
+ }
+ }
+}
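+
+/*
+ * Example of the bucket update above: if bucket @idx currently holds 20
+ * (>= CONSISTENT_THRES), it is bumped by INC_STEP_BIG (16) to 36, while
+ * a bucket holding 5 that was not hit this window decays by DEC_STEP (2)
+ * to 3, and one holding 1 drops to 0.
+ */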
+
+static inline int busy_to_bucket(u32 normalized_rt)
+{
+ int bidx;
+
+ bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
+ bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
+
+ /*
+	 * Combine the lowest two buckets. Busy time at the lowest frequency
+	 * already falls into the 2nd bucket, so predicting the lowest bucket
+	 * on its own is not useful.
+ */
+ if (!bidx)
+ bidx++;
+
+ return bidx;
+}
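+
+/*
+ * Example of the mapping above, assuming NUM_BUSY_BUCKETS == 10: busy
+ * time of 25% of max_task_load() lands in bucket 2, a fully busy window
+ * in bucket 9, and anything that would land in bucket 0 is promoted to
+ * bucket 1 since the lowest two buckets are combined.
+ */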
+
+static inline u64
+scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
+{
+ return div64_u64(load * (u64)src_freq, (u64)dst_freq);
+}
+
+/*
+ * get_pred_busy - calculate predicted demand for a task on runqueue
+ *
+ * @rq: runqueue of task p
+ * @p: task whose prediction is being updated
+ * @start: starting bucket. returned prediction should not be lower than
+ * this bucket.
+ * @runtime: runtime of the task. returned prediction should not be lower
+ * than this runtime.
+ * Note: @start can be derived from @runtime. It's passed in only to
+ * avoid duplicated calculation in some cases.
+ *
+ * A new predicted busy time is returned for task @p based on @runtime
+ * passed in. The function searches through buckets that represent busy
+ * time equal to or bigger than @runtime and attempts to find the bucket
+ * to use for prediction. Once found, it searches through historical busy
+ * time and returns the latest that falls into the bucket. If no such busy
+ * time exists, it returns the midpoint of that bucket.
+ */
+static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
+ int start, u32 runtime)
+{
+ int i;
+ u8 *buckets = p->ravg.busy_buckets;
+ u32 *hist = p->ravg.sum_history;
+ u32 dmin, dmax;
+ u64 cur_freq_runtime = 0;
+ int first = NUM_BUSY_BUCKETS, final;
+ u32 ret = runtime;
+
+ /* skip prediction for new tasks due to lack of history */
+ if (unlikely(is_new_task(p)))
+ goto out;
+
+ /* find minimal bucket index to pick */
+ for (i = start; i < NUM_BUSY_BUCKETS; i++) {
+ if (buckets[i]) {
+ first = i;
+ break;
+ }
+ }
+ /* if no higher buckets are filled, predict runtime */
+ if (first >= NUM_BUSY_BUCKETS)
+ goto out;
+
+ /* compute the bucket for prediction */
+ final = first;
+
+ /* determine demand range for the predicted bucket */
+ if (final < 2) {
+ /* lowest two buckets are combined */
+ dmin = 0;
+ final = 1;
+ } else {
+ dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
+ }
+ dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
+
+ /*
+	 * search through the runtime history and return the first runtime that
+	 * falls into the range of the predicted bucket.
+ */
+ for (i = 0; i < sched_ravg_hist_size; i++) {
+ if (hist[i] >= dmin && hist[i] < dmax) {
+ ret = hist[i];
+ break;
+ }
+ }
+	/* no historical runtime within the bucket found, use the bucket midpoint */
+ if (ret < dmin)
+ ret = (dmin + dmax) / 2;
+ /*
+	 * when updating in the middle of a window, runtime could be higher
+ * than all recorded history. Always predict at least runtime.
+ */
+ ret = max(runtime, ret);
+out:
+ trace_sched_update_pred_demand(rq, p, runtime,
+ mult_frac((unsigned int)cur_freq_runtime, 100,
+ sched_ravg_window), ret);
+ return ret;
+}
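+
+/*
+ * Worked example for the lookup above (NUM_BUSY_BUCKETS == 10 assumed):
+ * if @runtime points at bucket 3 and bucket 5 is the first non-empty
+ * bucket at or above it, the predicted range is [50%, 60%) of
+ * max_task_load(); the most recent history entry inside that range is
+ * returned, or the 55% midpoint if none exists, and never less than
+ * @runtime itself.
+ */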
+
+static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
+{
+ if (p->ravg.pred_demand >= p->ravg.curr_window)
+ return p->ravg.pred_demand;
+
+ return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
+ p->ravg.curr_window);
+}
+
+/*
+ * The predictive demand of a task is calculated at window rollover.
+ * If the task's current window busy time exceeds the predicted
+ * demand, update it here to reflect the task's needs.
+ */
+void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
+{
+ u32 new, old;
+
+ if (is_idle_task(p) || exiting_task(p))
+ return;
+
+ if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
+ (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
+ (event != TASK_MIGRATE &&
+ event != PICK_NEXT_TASK)))
+ return;
+
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved between
+	 * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
+ return;
+ }
+
+ new = calc_pred_demand(rq, p);
+ old = p->ravg.pred_demand;
+
+ if (old >= new)
+ return;
+
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ p->sched_class->fixup_hmp_sched_stats(rq, p,
+ p->ravg.demand,
+ new);
+
+ p->ravg.pred_demand = new;
+}
+
+void clear_top_tasks_bitmap(unsigned long *bitmap)
+{
+ memset(bitmap, 0, top_tasks_bitmap_size);
+ __set_bit(NUM_LOAD_INDICES, bitmap);
+}
+
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 top_task_load(struct rq *rq)
+{
+ int index = rq->prev_top;
+ u8 prev = 1 - rq->curr_table;
+
+ if (!index) {
+ int msb = NUM_LOAD_INDICES - 1;
+
+ if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
+ return 0;
+ else
+ return sched_load_granule;
+ } else if (index == NUM_LOAD_INDICES - 1) {
+ return sched_ravg_window;
+ } else {
+ return (index + 1) * sched_load_granule;
+ }
+}
+
+static u32 load_to_index(u32 load)
+{
+ u32 index = load / sched_load_granule;
+
+ return min(index, (u32)(NUM_LOAD_INDICES - 1));
+}
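+
+/*
+ * Granularity sketch for the two helpers above, assuming a 20ms window
+ * and NUM_LOAD_INDICES == 1000: sched_load_granule is 20us, so a task
+ * with 5ms in its window maps to index 250, and top_task_load() reports
+ * that index back as (250 + 1) * 20us, i.e. roughly 5ms.
+ */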
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+ u32 old_curr_window, int new_window, bool full_window)
+{
+ u8 curr = rq->curr_table;
+ u8 prev = 1 - curr;
+ u8 *curr_table = rq->top_tasks[curr];
+ u8 *prev_table = rq->top_tasks[prev];
+ int old_index, new_index, update_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ bool zero_index_update;
+
+ if (old_curr_window == curr_window && !new_window)
+ return;
+
+ old_index = load_to_index(old_curr_window);
+ new_index = load_to_index(curr_window);
+
+ if (!new_window) {
+ zero_index_update = !old_curr_window && curr_window;
+ if (old_index != new_index || zero_index_update) {
+ if (old_curr_window)
+ curr_table[old_index] -= 1;
+ if (curr_window)
+ curr_table[new_index] += 1;
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+
+ if (!curr_table[old_index])
+ __clear_bit(NUM_LOAD_INDICES - old_index - 1,
+ rq->top_tasks_bitmap[curr]);
+
+ if (curr_table[new_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - new_index - 1,
+ rq->top_tasks_bitmap[curr]);
+
+ return;
+ }
+
+ /*
+ * The window has rolled over for this task. By the time we get
+	 * here, the curr/prev swaps would have already occurred. So we need
+ * to use prev_window for the new index.
+ */
+ update_index = load_to_index(prev_window);
+
+ if (full_window) {
+ /*
+ * Two cases here. Either 'p' ran for the entire window or
+ * it didn't run at all. In either case there is no entry
+ * in the prev table. If 'p' ran the entire window, we just
+ * need to create a new entry in the prev table. In this case
+		 * update_index will correspond to sched_ravg_window,
+ * so we can unconditionally update the top index.
+ */
+ if (prev_window) {
+ prev_table[update_index] += 1;
+ rq->prev_top = update_index;
+ }
+
+ if (prev_table[update_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - update_index - 1,
+ rq->top_tasks_bitmap[prev]);
+ } else {
+ zero_index_update = !old_curr_window && prev_window;
+ if (old_index != update_index || zero_index_update) {
+ if (old_curr_window)
+ prev_table[old_index] -= 1;
+
+ prev_table[update_index] += 1;
+
+ if (update_index > rq->prev_top)
+ rq->prev_top = update_index;
+
+ if (!prev_table[old_index])
+ __clear_bit(NUM_LOAD_INDICES - old_index - 1,
+ rq->top_tasks_bitmap[prev]);
+
+ if (prev_table[update_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - update_index - 1,
+ rq->top_tasks_bitmap[prev]);
+ }
+ }
+
+ if (curr_window) {
+ curr_table[new_index] += 1;
+
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+
+ if (curr_table[new_index] == 1)
+ __set_bit(NUM_LOAD_INDICES - new_index - 1,
+ rq->top_tasks_bitmap[curr]);
+ }
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+ memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
+static void rollover_top_tasks(struct rq *rq, bool full_window)
+{
+ u8 curr_table = rq->curr_table;
+ u8 prev_table = 1 - curr_table;
+ int curr_top = rq->curr_top;
+
+ clear_top_tasks_table(rq->top_tasks[prev_table]);
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
+
+ if (full_window) {
+ curr_top = 0;
+ clear_top_tasks_table(rq->top_tasks[curr_table]);
+ clear_top_tasks_bitmap(
+ rq->top_tasks_bitmap[curr_table]);
+ }
+
+ rq->curr_table = prev_table;
+ rq->prev_top = curr_top;
+ rq->curr_top = 0;
+}
+
+static u32 empty_windows[NR_CPUS];
+
+static void rollover_task_window(struct task_struct *p, bool full_window)
+{
+ u32 *curr_cpu_windows = empty_windows;
+ u32 curr_window;
+ int i;
+
+ /* Rollover the sum */
+ curr_window = 0;
+
+ if (!full_window) {
+ curr_window = p->ravg.curr_window;
+ curr_cpu_windows = p->ravg.curr_window_cpu;
+ }
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+
+ /* Roll over individual CPU contributions */
+ for (i = 0; i < nr_cpu_ids; i++) {
+ p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
+ p->ravg.curr_window_cpu[i] = 0;
+ }
+}
+
+static void rollover_cpu_window(struct rq *rq, bool full_window)
+{
+ u64 curr_sum = rq->curr_runnable_sum;
+ u64 nt_curr_sum = rq->nt_curr_runnable_sum;
+ u64 grp_curr_sum = rq->grp_time.curr_runnable_sum;
+ u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum;
+
+ if (unlikely(full_window)) {
+ curr_sum = 0;
+ nt_curr_sum = 0;
+ grp_curr_sum = 0;
+ grp_nt_curr_sum = 0;
+ }
+
+ rq->prev_runnable_sum = curr_sum;
+ rq->nt_prev_runnable_sum = nt_curr_sum;
+ rq->grp_time.prev_runnable_sum = grp_curr_sum;
+ rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;
+
+ rq->curr_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = 0;
+ rq->grp_time.curr_runnable_sum = 0;
+ rq->grp_time.nt_curr_runnable_sum = 0;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, full_window = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = sched_ravg_window;
+ u64 delta;
+ u64 *curr_runnable_sum = &rq->curr_runnable_sum;
+ u64 *prev_runnable_sum = &rq->prev_runnable_sum;
+ u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ bool new_task;
+ struct related_thread_group *grp;
+ int cpu = rq->cpu;
+ u32 old_curr_window = p->ravg.curr_window;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ full_window = (window_start - mark_start) >= window_size;
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ new_task = is_new_task(p);
+
+ /*
+ * Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks.
+ */
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ if (new_window)
+ rollover_task_window(p, full_window);
+ }
+
+ if (p_is_curr_task && new_window) {
+ rollover_cpu_window(rq, full_window);
+ rollover_top_tasks(rq, full_window);
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+ goto done;
+
+ grp = p->grp;
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time = &rq->grp_time;
+
+ curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ }
+
+ if (!new_window) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window.
+ */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.curr_window += delta;
+ p->ravg.curr_window_cpu[cpu] += delta;
+ }
+
+ goto done;
+ }
+
+ if (!p_is_curr_task) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task.
+ */
+
+ if (!full_window) {
+ /*
+ * A full window hasn't elapsed, account partial
+ * contribution to previous completed window.
+ */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p)) {
+ p->ravg.prev_window += delta;
+ p->ravg.prev_window_cpu[cpu] += delta;
+ }
+ } else {
+ /*
+ * Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size).
+ */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p)) {
+ p->ravg.prev_window = delta;
+ p->ravg.prev_window_cpu[cpu] = delta;
+ }
+ }
+
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!exiting_task(p)) {
+ p->ravg.curr_window = delta;
+ p->ravg.curr_window_cpu[cpu] = delta;
+ }
+
+ goto done;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun.
+ */
+
+ if (!full_window) {
+ /*
+ * A full window hasn't elapsed, account partial
+ * contribution to previous completed window.
+ */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.prev_window += delta;
+ p->ravg.prev_window_cpu[cpu] += delta;
+ }
+ } else {
+ /*
+ * Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size).
+ */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.prev_window = delta;
+ p->ravg.prev_window_cpu[cpu] = delta;
+ }
+ }
+
+ /*
+ * Rollover is done here by overwriting the values in
+ * prev_runnable_sum and curr_runnable_sum.
+ */
+ *prev_runnable_sum += delta;
+ if (new_task)
+ *nt_prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ *curr_runnable_sum += delta;
+ if (new_task)
+ *nt_curr_runnable_sum += delta;
+
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ p->ravg.curr_window = delta;
+ p->ravg.curr_window_cpu[cpu] = delta;
+ }
+
+ goto done;
+ }
+
+ if (irqtime) {
+ /*
+ * account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime.
+ */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /*
+ * Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted.
+ */
+ if (mark_start > window_start) {
+ *curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /*
+ * The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first.
+ */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ *prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+done:
+ if (!is_idle_task(p) && !exiting_task(p))
+ update_top_tasks(p, rq, old_curr_window,
+ new_window, full_window);
+}
+
+static inline u32 predict_and_update_buckets(struct rq *rq,
+ struct task_struct *p, u32 runtime) {
+
+ int bidx;
+ u32 pred_demand;
+
+ bidx = busy_to_bucket(runtime);
+ pred_demand = get_pred_busy(rq, p, bidx, runtime);
+ bucket_increase(p->ravg.busy_buckets, bidx);
+
+ return pred_demand;
+}
+
+#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC)
+
+/*
+ * Assumes rq_lock is held and wallclock was recorded in the same critical
+ * section as this function's invocation.
+ */
+static inline u64 read_cycle_counter(int cpu, u64 wallclock)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+ u64 delta;
+
+ if (unlikely(!cluster))
+ return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+ /*
+ * Why don't we need locking here? Let's say that delta is negative
+ * because some other CPU happened to update last_cc_update with a
+	 * more recent timestamp. We simply read the counter again in that case
+ * with no harmful side effects. This can happen if there is an FIQ
+ * between when we read the wallclock and when we use it here.
+ */
+ delta = wallclock - atomic64_read(&cluster->last_cc_update);
+ if (delta > THRESH_CC_UPDATE) {
+ atomic64_set(&cluster->cycles,
+ cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu));
+ atomic64_set(&cluster->last_cc_update, wallclock);
+ }
+
+ return atomic64_read(&cluster->cycles);
+}
+
+static void update_task_cpu_cycles(struct task_struct *p, int cpu,
+ u64 wallclock)
+{
+ if (use_cycle_counter)
+ p->cpu_cycles = read_cycle_counter(cpu, wallclock);
+}
+
+static void
+update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ u64 cur_cycles;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!use_cycle_counter) {
+ rq->cc.cycles = cpu_cur_freq(cpu);
+ rq->cc.time = 1;
+ return;
+ }
+
+ cur_cycles = read_cycle_counter(cpu, wallclock);
+
+ /*
+	 * If the current task is the idle task and irqtime == 0, the CPU was
+	 * indeed idle and its cycle counter was probably not
+	 * increasing. We still need an estimated CPU frequency
+ * for IO wait time accounting. Use the previously
+ * calculated frequency in such a case.
+ */
+ if (!is_idle_task(rq->curr) || irqtime) {
+ if (unlikely(cur_cycles < p->cpu_cycles))
+ rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
+ else
+ rq->cc.cycles = cur_cycles - p->cpu_cycles;
+ rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
+
+ if (event == IRQ_UPDATE && is_idle_task(p))
+ /*
+ * Time between mark_start of idle task and IRQ handler
+ * entry time is CPU cycle counter stall period.
+ * Upon IRQ handler entry sched_account_irqstart()
+ * replenishes idle task's cpu cycle counter so
+ * rq->cc.cycles now represents increased cycles during
+ * IRQ handler rather than time between idle entry and
+ * IRQ exit. Thus use irqtime as time delta.
+ */
+ rq->cc.time = irqtime;
+ else
+ rq->cc.time = wallclock - p->ravg.mark_start;
+ BUG_ON((s64)rq->cc.time < 0);
+ }
+
+ p->cpu_cycles = cur_cycles;
+
+ trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles,
+ rq->cc.time, p);
+}
+
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
+{
+ /*
+ * No need to bother updating task demand for exiting tasks
+ * or the idle task.
+ */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /*
+ * When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time.
+ */
+ if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ /*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved between
+	 * related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Called when a new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand, pred_demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = sched_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, sched_ravg_hist_size);
+ if (sched_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+ pred_demand = predict_and_update_buckets(rq, p, runtime);
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+	 * changing p->on_rq. Since the dequeue decrements hmp stats,
+	 * avoid decrementing them here again.
+ */
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
+ pred_demand);
+
+ p->ravg.demand = demand;
+ p->ravg.pred_demand = pred_demand;
+
+done:
+ trace_sched_update_history(rq, p, runtime, samples, event);
+}
+
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > sched_ravg_window))
+ p->ravg.sum = sched_ravg_window;
+
+ return delta;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_windows' samples of window_size are also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = sched_ravg_window;
+ u64 runtime;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(rq, p, event)) {
+ if (new_window)
+ /*
+			 * If this time isn't being accounted as busy
+			 * time and a new window has started, only the
+			 * previous window needs to be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those.
+ */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return 0;
+ }
+
+ if (!new_window) {
+ /*
+ * The simple case - busy time contained within the existing
+ * window.
+ */
+ return add_to_task_demand(rq, p, wallclock - mark_start);
+ }
+
+ /*
+ * Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start.
+ */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ runtime = add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows) {
+ u64 scaled_window = scale_exec_time(window_size, rq);
+
+ update_history(rq, p, scaled_window, nr_full_windows, event);
+ runtime += nr_full_windows * scaled_window;
+ }
+
+ /*
+ * Roll window_start back to current to process any remainder
+ * in current window.
+ */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+ return runtime;
+}
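+
+/*
+ * Numeric walk-through of case (c) above, assuming a 20ms window and
+ * ignoring frequency scaling: with mark_start 5ms before a window
+ * boundary and wallclock 7ms past a boundary three windows later,
+ * nr_full_windows is 3; the trailing 5ms closes out the old window's
+ * history, three full 20ms samples are recorded, and the final 7ms is
+ * left in p->ravg.sum for the current window.
+ */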
+
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime)
+{
+ /*
+	 * update_task_demand() has checks for the idle task and
+	 * exiting tasks. The runtime may include the wait time,
+ * so update the burst only for the cases where the
+ * task is running.
+ */
+ if (event == PUT_PREV_TASK || (event == TASK_UPDATE &&
+ rq->curr == p))
+ p->ravg.curr_burst += runtime;
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime)
+{
+ u64 runtime;
+
+ if (!rq->window_start || sched_disable_window_stats ||
+ p->ravg.mark_start == wallclock)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start) {
+ update_task_cpu_cycles(p, cpu_of(rq), wallclock);
+ goto done;
+ }
+
+ update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
+ runtime = update_task_demand(p, rq, event, wallclock);
+ if (runtime)
+ update_task_burst(p, rq, event, runtime);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+ update_task_pred_demand(rq, p, event);
+
+ if (exiting_task(p))
+ goto done;
+
+ trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+ rq->cc.cycles, rq->cc.time,
+ p->grp ? &rq->grp_time : NULL);
+
+done:
+ p->ravg.mark_start = wallclock;
+}
+
+void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /* Decay CPU's irqload by 3/4 for each window. */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!rq->window_start || sched_disable_window_stats)
+ return;
+
+ if (is_idle_task(curr)) {
+ /* We're here without rq->lock held, IRQ disabled */
+ raw_spin_lock(&rq->lock);
+ update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+ u32 *curr_window_ptr = NULL;
+ u32 *prev_window_ptr = NULL;
+
+ if (exiting_task(p)) {
+ sum = EXITING_TASK_MARKER;
+ } else {
+ curr_window_ptr = p->ravg.curr_window_cpu;
+ prev_window_ptr = p->ravg.prev_window_cpu;
+ memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+ memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+ }
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+
+ p->ravg.curr_window_cpu = curr_window_ptr;
+ p->ravg.prev_window_cpu = prev_window_ptr;
+
+ p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+void mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start || sched_disable_window_stats) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = sched_ktime_clock();
+ p->ravg.mark_start = p->last_wake_ts = wallclock;
+ p->last_cpu_selected_ts = wallclock;
+ p->last_switch_out_ts = 0;
+ update_task_cpu_cycles(p, cpu_of(rq), wallclock);
+}
+
+void set_window_start(struct rq *rq)
+{
+ static int sync_cpu_available;
+
+ if (rq->window_start)
+ return;
+
+ if (!sync_cpu_available) {
+ rq->window_start = sched_ktime_clock();
+ sync_cpu_available = 1;
+ } else {
+ struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));
+
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = sync_rq->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+static void reset_all_task_stats(void)
+{
+ struct task_struct *g, *p;
+
+ do_each_thread(g, p) {
+ reset_task_stats(p);
+ } while_each_thread(g, p);
+}
+
+enum reset_reason_code {
+ WINDOW_CHANGE,
+ POLICY_CHANGE,
+ HIST_SIZE_CHANGE,
+ FREQ_AGGREGATE_CHANGE,
+};
+
+const char *sched_window_reset_reasons[] = {
+ "WINDOW_CHANGE",
+ "POLICY_CHANGE",
+ "HIST_SIZE_CHANGE",
+ "FREQ_AGGREGATE_CHANGE",
+};
+
+/* Called with IRQs enabled */
+void reset_all_window_stats(u64 window_start, unsigned int window_size)
+{
+ int cpu, i;
+ unsigned long flags;
+ u64 start_ts = sched_ktime_clock();
+ int reason = WINDOW_CHANGE;
+ unsigned int old = 0, new = 0;
+
+ local_irq_save(flags);
+
+ read_lock(&tasklist_lock);
+
+ read_lock(&related_thread_group_lock);
+
+ /* Taking all runqueue locks prevents race with sched_exit(). */
+ for_each_possible_cpu(cpu)
+ raw_spin_lock(&cpu_rq(cpu)->lock);
+
+ sched_disable_window_stats = 1;
+
+ reset_all_task_stats();
+
+ read_unlock(&tasklist_lock);
+
+ if (window_size) {
+ sched_ravg_window = window_size * TICK_NSEC;
+ set_hmp_defaults();
+ sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
+ }
+
+ sched_disable_window_stats = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (window_start)
+ rq->window_start = window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+ memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+ memset(&rq->load_subs[i], 0,
+ sizeof(struct load_subtractions));
+ clear_top_tasks_table(rq->top_tasks[i]);
+ clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
+ }
+
+ rq->curr_table = 0;
+ rq->curr_top = 0;
+ rq->prev_top = 0;
+ reset_cpu_hmp_stats(cpu, 1);
+ }
+
+ if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
+ reason = POLICY_CHANGE;
+ old = sched_window_stats_policy;
+ new = sysctl_sched_window_stats_policy;
+ sched_window_stats_policy = sysctl_sched_window_stats_policy;
+ } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
+ reason = HIST_SIZE_CHANGE;
+ old = sched_ravg_hist_size;
+ new = sysctl_sched_ravg_hist_size;
+ sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
+ } else if (sched_freq_aggregate !=
+ sysctl_sched_freq_aggregate) {
+ reason = FREQ_AGGREGATE_CHANGE;
+ old = sched_freq_aggregate;
+ new = sysctl_sched_freq_aggregate;
+ sched_freq_aggregate = sysctl_sched_freq_aggregate;
+ }
+
+ for_each_possible_cpu(cpu)
+ raw_spin_unlock(&cpu_rq(cpu)->lock);
+
+ read_unlock(&related_thread_group_lock);
+
+ local_irq_restore(flags);
+
+ trace_sched_reset_all_window_stats(window_start, window_size,
+ sched_ktime_clock() - start_ts, reason, old, new);
+}
+
+/*
+ * In this function we match the accumulated subtractions with the current
+ * and previous windows we are operating with. Ignore any entries where
+ * the window start in the load_subtraction struct does not match either
+ * the current or the previous window. This could happen whenever CPUs
+ * become idle or busy with interrupts disabled for an extended period.
+ */
+static inline void account_load_subtractions(struct rq *rq)
+{
+ u64 ws = rq->window_start;
+ u64 prev_ws = ws - sched_ravg_window;
+ struct load_subtractions *ls = rq->load_subs;
+ int i;
+
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+ if (ls[i].window_start == ws) {
+ rq->curr_runnable_sum -= ls[i].subs;
+ rq->nt_curr_runnable_sum -= ls[i].new_subs;
+ } else if (ls[i].window_start == prev_ws) {
+ rq->prev_runnable_sum -= ls[i].subs;
+ rq->nt_prev_runnable_sum -= ls[i].new_subs;
+ }
+
+ ls[i].subs = 0;
+ ls[i].new_subs = 0;
+ }
+
+ BUG_ON((s64)rq->prev_runnable_sum < 0);
+ BUG_ON((s64)rq->curr_runnable_sum < 0);
+ BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
+ BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
+}
+
+static inline u64 freq_policy_load(struct rq *rq, u64 load)
+{
+ unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
+
+ switch (reporting_policy) {
+ case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
+ load = max_t(u64, load, top_task_load(rq));
+ break;
+ case FREQ_REPORT_TOP_TASK:
+ load = top_task_load(rq);
+ break;
+ case FREQ_REPORT_CPU_LOAD:
+ break;
+ default:
+ break;
+ }
+
+ return load;
+}
+
+void sched_get_cpus_busy(struct sched_load *busy,
+ const struct cpumask *query_cpus)
+{
+ unsigned long flags;
+ struct rq *rq;
+ const int cpus = cpumask_weight(query_cpus);
+ u64 load[cpus], group_load[cpus];
+ u64 nload[cpus], ngload[cpus];
+ u64 pload[cpus];
+ unsigned int max_freq[cpus];
+ int notifier_sent = 0;
+ int early_detection[cpus];
+ int cpu, i = 0;
+ unsigned int window_size;
+ u64 max_prev_sum = 0;
+ int max_busy_cpu = cpumask_first(query_cpus);
+ u64 total_group_load = 0, total_ngload = 0;
+ bool aggregate_load = false;
+ struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus));
+
+ if (unlikely(cpus == 0))
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * This function could be called in timer context, and the
+ * current task may have been executing for a long time. Ensure
+ * that the window stats are current by doing an update.
+ */
+
+ for_each_cpu(cpu, query_cpus)
+ raw_spin_lock(&cpu_rq(cpu)->lock);
+
+ window_size = sched_ravg_window;
+
+ /*
+ * We don't really need the cluster lock for this entire for loop
+ * block. However, there is no advantage in optimizing this as rq
+ * locks are held regardless and would prevent migration anyway.
+ */
+ raw_spin_lock(&cluster->load_lock);
+
+ for_each_cpu(cpu, query_cpus) {
+ rq = cpu_rq(cpu);
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
+ 0);
+
+ account_load_subtractions(rq);
+ load[i] = rq->prev_runnable_sum;
+ nload[i] = rq->nt_prev_runnable_sum;
+ pload[i] = rq->hmp_stats.pred_demands_sum;
+ rq->old_estimated_time = pload[i];
+
+ if (load[i] > max_prev_sum) {
+ max_prev_sum = load[i];
+ max_busy_cpu = cpu;
+ }
+
+ /*
+ * sched_get_cpus_busy() is called for all CPUs in a
+ * frequency domain. So the notifier_sent flag per
+ * cluster works even when a frequency domain spans
+ * more than 1 cluster.
+ */
+ if (rq->cluster->notifier_sent) {
+ notifier_sent = 1;
+ rq->cluster->notifier_sent = 0;
+ }
+ early_detection[i] = (rq->ed_task != NULL);
+ max_freq[i] = cpu_max_freq(cpu);
+ i++;
+ }
+
+ raw_spin_unlock(&cluster->load_lock);
+
+ group_load_in_freq_domain(
+ &cpu_rq(max_busy_cpu)->freq_domain_cpumask,
+ &total_group_load, &total_ngload);
+ aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ group_load[i] = 0;
+ ngload[i] = 0;
+
+ if (early_detection[i])
+ goto skip_early;
+
+ rq = cpu_rq(cpu);
+ if (aggregate_load) {
+ if (cpu == max_busy_cpu) {
+ group_load[i] = total_group_load;
+ ngload[i] = total_ngload;
+ }
+ } else {
+ group_load[i] = rq->grp_time.prev_runnable_sum;
+ ngload[i] = rq->grp_time.nt_prev_runnable_sum;
+ }
+
+ load[i] += group_load[i];
+ nload[i] += ngload[i];
+
+ load[i] = freq_policy_load(rq, load[i]);
+ rq->old_busy_time = load[i];
+
+ /*
+ * Scale load in reference to cluster max_possible_freq.
+ *
+ * Note that scale_load_to_cpu() scales load in reference to
+ * the cluster max_freq.
+ */
+ load[i] = scale_load_to_cpu(load[i], cpu);
+ nload[i] = scale_load_to_cpu(nload[i], cpu);
+ pload[i] = scale_load_to_cpu(pload[i], cpu);
+skip_early:
+ i++;
+ }
+
+ for_each_cpu(cpu, query_cpus)
+ raw_spin_unlock(&(cpu_rq(cpu))->lock);
+
+ local_irq_restore(flags);
+
+ i = 0;
+ for_each_cpu(cpu, query_cpus) {
+ rq = cpu_rq(cpu);
+
+ if (early_detection[i]) {
+ busy[i].prev_load = div64_u64(sched_ravg_window,
+ NSEC_PER_USEC);
+ busy[i].new_task_load = 0;
+ busy[i].predicted_load = 0;
+ goto exit_early;
+ }
+
+ load[i] = scale_load_to_freq(load[i], max_freq[i],
+ cpu_max_possible_freq(cpu));
+ nload[i] = scale_load_to_freq(nload[i], max_freq[i],
+ cpu_max_possible_freq(cpu));
+
+ pload[i] = scale_load_to_freq(pload[i], max_freq[i],
+ rq->cluster->max_possible_freq);
+
+ busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
+ busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
+ busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
+
+exit_early:
+ trace_sched_get_busy(cpu, busy[i].prev_load,
+ busy[i].new_task_load,
+ busy[i].predicted_load,
+ early_detection[i],
+ aggregate_load &&
+ cpu == max_busy_cpu);
+ i++;
+ }
+}
+
+void sched_set_io_is_busy(int val)
+{
+ sched_io_is_busy = val;
+}
+
+int sched_set_window(u64 window_start, unsigned int window_size)
+{
+ u64 now, cur_jiffies, jiffy_ktime_ns;
+ s64 ws;
+ unsigned long flags;
+
+ if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)
+ return -EINVAL;
+
+ mutex_lock(&policy_mutex);
+
+ /*
+ * Get a consistent view of ktime, jiffies, and the time
+ * since the last jiffy (based on last_jiffies_update).
+ */
+ local_irq_save(flags);
+ cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
+ local_irq_restore(flags);
+
+ /* translate window_start from jiffies to nanoseconds */
+ ws = (window_start - cur_jiffies); /* jiffy difference */
+ ws *= TICK_NSEC;
+ ws += jiffy_ktime_ns;
+
+ /*
+ * Roll back calculated window start so that it is in
+ * the past (window stats must have a current window).
+ */
+ while (ws > now)
+ ws -= (window_size * TICK_NSEC);
+
+ BUG_ON(sched_ktime_clock() < ws);
+
+ reset_all_window_stats(ws, window_size);
+
+ sched_update_freq_max_load(cpu_possible_mask);
+
+ mutex_unlock(&policy_mutex);
+
+ return 0;
+}
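
For reference, a minimal standalone sketch (plain userspace C, not part of the patch) of the jiffy-to-nanosecond conversion and rollback performed by sched_set_window() above. The tick length and all input values are assumptions chosen only to make the arithmetic concrete.

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 10000000LL	/* assumed 100 Hz tick, i.e. 10 ms per jiffy */

/*
 * Translate a window start given in jiffies into nanoseconds, anchored at
 * the ktime of the last jiffy update, then roll it back in window-size
 * steps until it lies in the past (mirrors the loop in sched_set_window()).
 */
static int64_t window_start_ns(int64_t window_start_jiffies, int64_t cur_jiffies,
			       int64_t jiffy_ktime_ns, int64_t now_ns,
			       int64_t window_size_jiffies)
{
	int64_t ws = (window_start_jiffies - cur_jiffies) * TICK_NSEC;

	ws += jiffy_ktime_ns;
	while (ws > now_ns)
		ws -= window_size_jiffies * TICK_NSEC;
	return ws;
}

int main(void)
{
	/* Hypothetical inputs: requested start is 3 jiffies in the future. */
	int64_t ws = window_start_ns(1003, 1000, 5000000000LL, 5010000000LL, 2);

	printf("window start = %lld ns\n", (long long)ws);	/* 5010000000 */
	return 0;
}
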
+
+static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
+{
+ rq->load_subs[index].window_start = ws;
+ rq->load_subs[index].subs = 0;
+ rq->load_subs[index].new_subs = 0;
+}
+
+static bool get_subtraction_index(struct rq *rq, u64 ws)
+{
+ int i;
+ u64 oldest = ULLONG_MAX;
+ int oldest_index = 0;
+
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+ u64 entry_ws = rq->load_subs[i].window_start;
+
+ if (ws == entry_ws)
+ return i;
+
+ if (entry_ws < oldest) {
+ oldest = entry_ws;
+ oldest_index = i;
+ }
+ }
+
+ create_subtraction_entry(rq, ws, oldest_index);
+ return oldest_index;
+}
+
+static void update_rq_load_subtractions(int index, struct rq *rq,
+ u32 sub_load, bool new_task)
+{
+ rq->load_subs[index].subs += sub_load;
+ if (new_task)
+ rq->load_subs[index].new_subs += sub_load;
+}
+
+static void update_cluster_load_subtractions(struct task_struct *p,
+ int cpu, u64 ws, bool new_task)
+{
+ struct sched_cluster *cluster = cpu_cluster(cpu);
+ struct cpumask cluster_cpus = cluster->cpus;
+ u64 prev_ws = ws - sched_ravg_window;
+ int i;
+
+ cpumask_clear_cpu(cpu, &cluster_cpus);
+ raw_spin_lock(&cluster->load_lock);
+
+ for_each_cpu(i, &cluster_cpus) {
+ struct rq *rq = cpu_rq(i);
+ int index;
+
+ if (p->ravg.curr_window_cpu[i]) {
+ index = get_subtraction_index(rq, ws);
+ update_rq_load_subtractions(index, rq,
+ p->ravg.curr_window_cpu[i], new_task);
+ p->ravg.curr_window_cpu[i] = 0;
+ }
+
+ if (p->ravg.prev_window_cpu[i]) {
+ index = get_subtraction_index(rq, prev_ws);
+ update_rq_load_subtractions(index, rq,
+ p->ravg.prev_window_cpu[i], new_task);
+ p->ravg.prev_window_cpu[i] = 0;
+ }
+ }
+
+ raw_spin_unlock(&cluster->load_lock);
+}
+
+static inline void inter_cluster_migration_fixup
+ (struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
+{
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ struct rq *src_rq = cpu_rq(task_cpu);
+
+ if (same_freq_domain(new_cpu, task_cpu))
+ return;
+
+ p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
+ p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
+
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+
+ src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
+ src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
+
+ if (new_task) {
+ dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+ dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+
+ src_rq->nt_curr_runnable_sum -=
+ p->ravg.curr_window_cpu[task_cpu];
+ src_rq->nt_prev_runnable_sum -=
+ p->ravg.prev_window_cpu[task_cpu];
+ }
+
+ p->ravg.curr_window_cpu[task_cpu] = 0;
+ p->ravg.prev_window_cpu[task_cpu] = 0;
+
+ update_cluster_load_subtractions(p, task_cpu,
+ src_rq->window_start, new_task);
+
+ BUG_ON((s64)src_rq->prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+ BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
+ BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
+}
+
+static int get_top_index(unsigned long *bitmap, unsigned long old_top)
+{
+ int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
+
+ if (index == NUM_LOAD_INDICES)
+ return 0;
+
+ return NUM_LOAD_INDICES - 1 - index;
+}
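
A minimal sketch of the mirrored bitmap trick behind get_top_index() above: a load bucket 'index' is stored at bit (NUM_LOAD_INDICES - 1 - index), so a forward scan from bit 0 finds the highest-load bucket first. find_next_bit() is replaced by a naive loop and NUM_LOAD_INDICES is shrunk so the example is self-contained; both are assumptions for illustration only.

#include <stdio.h>
#include <string.h>

#define NUM_LOAD_INDICES 8	/* small for illustration; the patch uses 1000 */

/* Naive stand-in for find_next_bit(): first set "bit" at or after 'start'. */
static int next_bit(const unsigned char *bitmap, int size, int start)
{
	for (int i = start; i < size; i++)
		if (bitmap[i])
			return i;
	return size;
}

int main(void)
{
	unsigned char bitmap[NUM_LOAD_INDICES];

	memset(bitmap, 0, sizeof(bitmap));
	/* A task in load bucket 5 sets the mirrored position. */
	bitmap[NUM_LOAD_INDICES - 1 - 5] = 1;
	/* Another task in bucket 2. */
	bitmap[NUM_LOAD_INDICES - 1 - 2] = 1;

	int bit = next_bit(bitmap, NUM_LOAD_INDICES, 0);
	int top = (bit == NUM_LOAD_INDICES) ? 0 : NUM_LOAD_INDICES - 1 - bit;

	printf("top load bucket = %d\n", top);	/* prints 5: highest bucket wins */
	return 0;
}
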
+
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+ int index;
+ int top_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ u8 src = src_rq->curr_table;
+ u8 dst = dst_rq->curr_table;
+ u8 *src_table;
+ u8 *dst_table;
+
+ if (curr_window) {
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(curr_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (!src_table[index])
+ __clear_bit(NUM_LOAD_INDICES - index - 1,
+ src_rq->top_tasks_bitmap[src]);
+
+ if (dst_table[index] == 1)
+ __set_bit(NUM_LOAD_INDICES - index - 1,
+ dst_rq->top_tasks_bitmap[dst]);
+
+ if (index > dst_rq->curr_top)
+ dst_rq->curr_top = index;
+
+ top_index = src_rq->curr_top;
+ if (index == top_index && !src_table[index])
+ src_rq->curr_top = get_top_index(
+ src_rq->top_tasks_bitmap[src], top_index);
+ }
+
+ if (prev_window) {
+ src = 1 - src;
+ dst = 1 - dst;
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(prev_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (!src_table[index])
+ __clear_bit(NUM_LOAD_INDICES - index - 1,
+ src_rq->top_tasks_bitmap[src]);
+
+ if (dst_table[index] == 1)
+ __set_bit(NUM_LOAD_INDICES - index - 1,
+ dst_rq->top_tasks_bitmap[dst]);
+
+ if (index > dst_rq->prev_top)
+ dst_rq->prev_top = index;
+
+ top_index = src_rq->prev_top;
+ if (index == top_index && !src_table[index])
+ src_rq->prev_top = get_top_index(
+ src_rq->top_tasks_bitmap[src], top_index);
+ }
+}
+
+void fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ bool new_task;
+ struct related_thread_group *grp;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ clear_ed_task(p, src_rq);
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ if (sched_disable_window_stats)
+ goto done;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE,
+ wallclock, 0);
+ update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ update_task_ravg(p, task_rq(p), TASK_MIGRATE,
+ wallclock, 0);
+
+ update_task_cpu_cycles(p, new_cpu, wallclock);
+
+ new_task = is_new_task(p);
+ /* Protected by rq_lock */
+ grp = p->grp;
+
+ /*
+ * For frequency aggregation, we continue to do migration fixups
+ * even for intra-cluster migrations. This is because the aggregated
+ * load has to be reported on a single CPU regardless.
+ */
+ if (grp && sched_freq_aggregate) {
+ struct group_cpu_time *cpu_time;
+
+ cpu_time = &src_rq->grp_time;
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ cpu_time = &dest_rq->grp_time;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ if (p->ravg.curr_window) {
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -=
+ p->ravg.curr_window;
+ *dst_nt_curr_runnable_sum +=
+ p->ravg.curr_window;
+ }
+ }
+
+ if (p->ravg.prev_window) {
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_prev_runnable_sum -=
+ p->ravg.prev_window;
+ *dst_nt_prev_runnable_sum +=
+ p->ravg.prev_window;
+ }
+ }
+ } else {
+ inter_cluster_migration_fixup(p, new_cpu,
+ task_cpu(p), new_task);
+ }
+
+ migrate_top_tasks(p, src_rq, dest_rq);
+
+ if (p == src_rq->ed_task) {
+ src_rq->ed_task = NULL;
+ if (!dest_rq->ed_task)
+ dest_rq->ed_task = p;
+ }
+
+done:
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+#define sched_up_down_migrate_auto_update 1
+static void check_for_up_down_migrate_update(const struct cpumask *cpus)
+{
+ int i = cpumask_first(cpus);
+
+ if (!sched_up_down_migrate_auto_update)
+ return;
+
+ if (cpu_max_possible_capacity(i) == max_possible_capacity)
+ return;
+
+ if (cpu_max_possible_freq(i) == cpu_max_freq(i))
+ up_down_migrate_scale_factor = 1024;
+ else
+ up_down_migrate_scale_factor = (1024 *
+ cpu_max_possible_freq(i)) / cpu_max_freq(i);
+
+ update_up_down_migrate();
+}
+
+/* Return cluster which can offer required capacity for group */
+static struct sched_cluster *best_cluster(struct related_thread_group *grp,
+ u64 total_demand, bool group_boost)
+{
+ struct sched_cluster *cluster = NULL;
+
+ for_each_sched_cluster(cluster) {
+ if (group_will_fit(cluster, grp, total_demand, group_boost))
+ return cluster;
+ }
+
+ return sched_cluster[0];
+}
+
+static void _set_preferred_cluster(struct related_thread_group *grp)
+{
+ struct task_struct *p;
+ u64 combined_demand = 0;
+ bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
+ bool group_boost = false;
+ u64 wallclock;
+
+ if (list_empty(&grp->tasks))
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ /*
+ * Wakeups of two or more related tasks could race with each other and
+ * result in multiple calls to _set_preferred_cluster() at the same
+ * time. Avoid the overhead of rechecking the preferred cluster in
+ * such cases.
+ */
+ if (wallclock - grp->last_update < sched_ravg_window / 10)
+ return;
+
+ list_for_each_entry(p, &grp->tasks, grp_list) {
+ if (boost_on_big && task_sched_boost(p)) {
+ group_boost = true;
+ break;
+ }
+
+ if (p->ravg.mark_start < wallclock -
+ (sched_ravg_window * sched_ravg_hist_size))
+ continue;
+
+ combined_demand += p->ravg.demand;
+
+ }
+
+ grp->preferred_cluster = best_cluster(grp,
+ combined_demand, group_boost);
+ grp->last_update = sched_ktime_clock();
+ trace_sched_set_preferred_cluster(grp, combined_demand);
+}
+
+void set_preferred_cluster(struct related_thread_group *grp)
+{
+ raw_spin_lock(&grp->lock);
+ _set_preferred_cluster(grp);
+ raw_spin_unlock(&grp->lock);
+}
+
+#define ADD_TASK 0
+#define REM_TASK 1
+
+#define DEFAULT_CGROUP_COLOC_ID 1
+
+/*
+ * Task's cpu usage is accounted in:
+ * rq->curr/prev_runnable_sum, when its ->grp is NULL
+ * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer task's cpu usage between those counters when transitioning between
+ * groups
+ */
+static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
+ struct task_struct *p, int event)
+{
+ u64 wallclock;
+ struct group_cpu_time *cpu_time;
+ u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+ u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+ u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+ u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+ int migrate_type;
+ int cpu = cpu_of(rq);
+ bool new_task;
+ int i;
+
+ if (!sched_freq_aggregate)
+ return;
+
+ wallclock = sched_ktime_clock();
+
+ update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+ new_task = is_new_task(p);
+
+ cpu_time = &rq->grp_time;
+ if (event == ADD_TASK) {
+ migrate_type = RQ_TO_GROUP;
+
+ src_curr_runnable_sum = &rq->curr_runnable_sum;
+ dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ src_prev_runnable_sum = &rq->prev_runnable_sum;
+ dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+ *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
+ *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
+ if (new_task) {
+ *src_nt_curr_runnable_sum -=
+ p->ravg.curr_window_cpu[cpu];
+ *src_nt_prev_runnable_sum -=
+ p->ravg.prev_window_cpu[cpu];
+ }
+
+ update_cluster_load_subtractions(p, cpu,
+ rq->window_start, new_task);
+
+ } else {
+ migrate_type = GROUP_TO_RQ;
+
+ src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+ dst_curr_runnable_sum = &rq->curr_runnable_sum;
+ src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+ dst_prev_runnable_sum = &rq->prev_runnable_sum;
+
+ src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+ dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
+ src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+ dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+
+ *src_curr_runnable_sum -= p->ravg.curr_window;
+ *src_prev_runnable_sum -= p->ravg.prev_window;
+ if (new_task) {
+ *src_nt_curr_runnable_sum -= p->ravg.curr_window;
+ *src_nt_prev_runnable_sum -= p->ravg.prev_window;
+ }
+
+ /*
+ * Need to reset curr/prev windows for all CPUs, not just the
+ * ones in the same cluster. Since inter-cluster migrations
+ * did not result in the appropriate bookkeeping, the values
+ * per CPU would be inaccurate.
+ */
+ for_each_possible_cpu(i) {
+ p->ravg.curr_window_cpu[i] = 0;
+ p->ravg.prev_window_cpu[i] = 0;
+ }
+ }
+
+ *dst_curr_runnable_sum += p->ravg.curr_window;
+ *dst_prev_runnable_sum += p->ravg.prev_window;
+ if (new_task) {
+ *dst_nt_curr_runnable_sum += p->ravg.curr_window;
+ *dst_nt_prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ /*
+ * When a task enters or exits a group, its curr and prev windows are
+ * moved to a single CPU. This behavior might be sub-optimal in the
+ * exit case, however, it saves us the overhead of handling inter
+ * cluster migration fixups while the task is part of a related group.
+ */
+ p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
+ p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
+
+ trace_sched_migration_update_sum(p, migrate_type, rq);
+
+ BUG_ON((s64)*src_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_prev_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
+ BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
+}
+
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
+{
+ return related_thread_groups[group_id];
+}
+
+int alloc_related_thread_groups(void)
+{
+ int i, ret;
+ struct related_thread_group *grp;
+
+ /* group_id 0 is invalid; it is the special id used to remove a task from its group. */
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+ if (!grp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ grp->id = i;
+ INIT_LIST_HEAD(&grp->tasks);
+ INIT_LIST_HEAD(&grp->list);
+ raw_spin_lock_init(&grp->lock);
+
+ related_thread_groups[i] = grp;
+ }
+
+ return 0;
+
+err:
+ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+ grp = lookup_related_thread_group(i);
+ if (grp) {
+ kfree(grp);
+ related_thread_groups[i] = NULL;
+ } else {
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void remove_task_from_group(struct task_struct *p)
+{
+ struct related_thread_group *grp = p->grp;
+ struct rq *rq;
+ int empty_group = 1;
+
+ raw_spin_lock(&grp->lock);
+
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, p->grp, p, REM_TASK);
+ list_del_init(&p->grp_list);
+ rcu_assign_pointer(p->grp, NULL);
+ __task_rq_unlock(rq);
+
+ if (!list_empty(&grp->tasks)) {
+ empty_group = 0;
+ _set_preferred_cluster(grp);
+ }
+
+ raw_spin_unlock(&grp->lock);
+
+ /* Reserved groups cannot be destroyed */
+ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+ /*
+ * We test whether grp->list is attached with list_empty()
+ * hence re-init the list after deletion.
+ */
+ list_del_init(&grp->list);
+}
+
+static int
+add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
+{
+ struct rq *rq;
+
+ raw_spin_lock(&grp->lock);
+
+ /*
+ * Change p->grp under rq->lock. This prevents races with read-side
+ * references of p->grp in various hot paths.
+ */
+ rq = __task_rq_lock(p);
+ transfer_busy_time(rq, grp, p, ADD_TASK);
+ list_add(&p->grp_list, &grp->tasks);
+ rcu_assign_pointer(p->grp, grp);
+ __task_rq_unlock(rq);
+
+ _set_preferred_cluster(grp);
+
+ raw_spin_unlock(&grp->lock);
+
+ return 0;
+}
+
+void add_new_task_to_grp(struct task_struct *new)
+{
+ unsigned long flags;
+ struct related_thread_group *grp;
+ struct task_struct *leader = new->group_leader;
+ unsigned int leader_grp_id = sched_get_group_id(leader);
+
+ if (!sysctl_sched_enable_thread_grouping &&
+ leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
+ return;
+
+ if (thread_group_leader(new))
+ return;
+
+ if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+ if (!same_schedtune(new, leader))
+ return;
+ }
+
+ write_lock_irqsave(&related_thread_group_lock, flags);
+
+ rcu_read_lock();
+ grp = task_related_thread_group(leader);
+ rcu_read_unlock();
+
+ /*
+ * It's possible that someone already added the new task to the
+ * group. A leader's thread group is updated prior to calling
+ * this function. It's also possible that the leader has exited
+ * the group. In either case, there is nothing else to do.
+ */
+ if (!grp || new->grp) {
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+ return;
+ }
+
+ raw_spin_lock(&grp->lock);
+
+ rcu_assign_pointer(new->grp, grp);
+ list_add(&new->grp_list, &grp->tasks);
+
+ raw_spin_unlock(&grp->lock);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+}
+
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ int rc = 0;
+ unsigned long flags;
+ struct related_thread_group *grp = NULL;
+
+ if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ write_lock(&related_thread_group_lock);
+
+ /* Switching from one group to another directly is not permitted */
+ if ((current != p && p->flags & PF_EXITING) ||
+ (!p->grp && !group_id) ||
+ (p->grp && group_id))
+ goto done;
+
+ if (!group_id) {
+ remove_task_from_group(p);
+ goto done;
+ }
+
+ grp = lookup_related_thread_group(group_id);
+ if (list_empty(&grp->list))
+ list_add(&grp->list, &active_related_thread_groups);
+
+ rc = add_task_to_group(p, grp);
+done:
+ write_unlock(&related_thread_group_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return rc;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID)
+ return -EINVAL;
+
+ return __sched_set_group_id(p, group_id);
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+ unsigned int group_id;
+ struct related_thread_group *grp;
+
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ group_id = grp ? grp->id : 0;
+ rcu_read_unlock();
+
+ return group_id;
+}
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore this colocation group cannot
+ * be destroyed once it has been created. All of this has been done as
+ * part of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+ struct related_thread_group *grp = NULL;
+ unsigned long flags;
+
+ grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+ write_lock_irqsave(&related_thread_group_lock, flags);
+ list_add(&grp->list, &active_related_thread_groups);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+ update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+ return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+ unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+ return __sched_set_group_id(p, grp_id);
+}
+#endif
+
+static void update_cpu_cluster_capacity(const cpumask_t *cpus)
+{
+ int i;
+ struct sched_cluster *cluster;
+ struct cpumask cpumask;
+
+ cpumask_copy(&cpumask, cpus);
+ pre_big_task_count_change(cpu_possible_mask);
+
+ for_each_cpu(i, &cpumask) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
+
+ cluster->capacity = compute_capacity(cluster);
+ cluster->load_scale_factor = compute_load_scale_factor(cluster);
+
+ /* 'cpus' can contain CPUs from more than one cluster */
+ check_for_up_down_migrate_update(&cluster->cpus);
+ }
+
+ __update_min_max_capacity();
+
+ post_big_task_count_change(cpu_possible_mask);
+}
+
+static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
+void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
+{
+ struct cpumask cpumask;
+ struct sched_cluster *cluster;
+ int i, update_capacity = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
+ cpumask_copy(&cpumask, cpus);
+ for_each_cpu(i, &cpumask) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
+
+ update_capacity += (cluster->max_mitigated_freq != fmax);
+ cluster->max_mitigated_freq = fmax;
+ }
+ spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
+
+ if (update_capacity)
+ update_cpu_cluster_capacity(cpus);
+}
+
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ struct sched_cluster *cluster = NULL;
+ struct cpumask policy_cluster = *policy->related_cpus;
+ unsigned int orig_max_freq = 0;
+ int i, j, update_capacity = 0;
+
+ if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+ val != CPUFREQ_CREATE_POLICY)
+ return 0;
+
+ if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+ update_min_max_capacity();
+ return 0;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ for_each_cpu(i, &policy_cluster) {
+ cluster = cpu_rq(i)->cluster;
+ cpumask_andnot(&policy_cluster, &policy_cluster,
+ &cluster->cpus);
+
+ orig_max_freq = cluster->max_freq;
+ cluster->min_freq = policy->min;
+ cluster->max_freq = policy->max;
+ cluster->cur_freq = policy->cur;
+
+ if (!cluster->freq_init_done) {
+ mutex_lock(&cluster_lock);
+ for_each_cpu(j, &cluster->cpus)
+ cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
+ policy->related_cpus);
+ cluster->max_possible_freq = policy->cpuinfo.max_freq;
+ cluster->max_possible_capacity =
+ compute_max_possible_capacity(cluster);
+ cluster->freq_init_done = true;
+
+ sort_clusters();
+ update_all_clusters_stats();
+ mutex_unlock(&cluster_lock);
+ continue;
+ }
+
+ update_capacity += (orig_max_freq != cluster->max_freq);
+ }
+
+ if (update_capacity)
+ update_cpu_cluster_capacity(policy->related_cpus);
+
+ return 0;
+}
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+ unsigned int cpu = freq->cpu, new_freq = freq->new;
+ unsigned long flags;
+ struct sched_cluster *cluster;
+ struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
+ int i, j;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ BUG_ON(!new_freq);
+
+ if (cpu_cur_freq(cpu) == new_freq)
+ return 0;
+
+ for_each_cpu(i, &policy_cpus) {
+ cluster = cpu_rq(i)->cluster;
+
+ for_each_cpu(j, &cluster->cpus) {
+ struct rq *rq = cpu_rq(j);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ sched_ktime_clock(), 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ cluster->cur_freq = new_freq;
+ cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
+ }
+
+ return 0;
+}
+
+static int pwr_stats_ready_notifier(struct notifier_block *nb,
+ unsigned long cpu, void *data)
+{
+ cpumask_t mask = CPU_MASK_NONE;
+
+ cpumask_set_cpu(cpu, &mask);
+ sched_update_freq_max_load(&mask);
+
+ mutex_lock(&cluster_lock);
+ sort_clusters();
+ mutex_unlock(&cluster_lock);
+
+ return 0;
+}
+
+static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+};
+
+static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+};
+
+static struct notifier_block notifier_pwr_stats_ready = {
+ .notifier_call = pwr_stats_ready_notifier
+};
+
+int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
+{
+ return -EINVAL;
+}
+
+static int register_sched_callback(void)
+{
+ int ret;
+
+ ret = cpufreq_register_notifier(&notifier_policy_block,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (!ret)
+ ret = cpufreq_register_notifier(&notifier_trans_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
+
+ return 0;
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ u32 new_load = task_load(p);
+
+ if (!grp)
+ return 0;
+
+ /*
+ * Update if task's load has changed significantly or a complete window
+ * has passed since we last updated preference
+ */
+ if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+ sched_ktime_clock() - grp->last_update > sched_ravg_window)
+ return 1;
+
+ return 0;
+}
+
+bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+ struct task_struct *p;
+ int loop_max = 10;
+
+ if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
+ return 0;
+
+ rq->ed_task = NULL;
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
+ if (!loop_max)
+ break;
+
+ if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
+ rq->ed_task = p;
+ return 1;
+ }
+
+ loop_max--;
+ }
+
+ return 0;
+}
+
+void update_avg_burst(struct task_struct *p)
+{
+ update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
+ p->ravg.curr_burst = 0;
+}
+
+void note_task_waking(struct task_struct *p, u64 wallclock)
+{
+ u64 sleep_time = wallclock - p->last_switch_out_ts;
+
+ /*
+ * When a short burst and short sleeping task goes for a long
+ * sleep, the task's avg_sleep_time gets boosted. It will not
+ * come below the short-sleep threshold for a long time, which
+ * results in incorrect packing. The idea behind tracking
+ * avg_sleep_time is to detect if a task is short sleeping
+ * or not. So limit the sleep time to twice the short sleep
+ * threshold. For regular long sleeping tasks, the avg_sleep_time
+ * would be higher than threshold, and packing happens correctly.
+ */
+ sleep_time = min_t(u64, sleep_time, 2 * sysctl_sched_short_sleep);
+ update_avg(&p->ravg.avg_sleep_time, sleep_time);
+
+ p->last_wake_ts = wallclock;
+}
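
update_avg() itself is defined outside this hunk; the sketch below assumes a simple 1/8-weight exponential moving average purely to illustrate why note_task_waking() clamps the sleep sample at twice the short-sleep threshold before averaging (the threshold and sample values are likewise assumed examples).

#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
	/* Assumed EMA with 1/8 weight; the in-kernel helper may differ. */
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff / 8;
}

int main(void)
{
	uint64_t short_sleep = 50000;		/* assumed threshold, ns */
	uint64_t avg_sleep_time = 40000;	/* task looks "short sleeping" */
	uint64_t long_sleep = 10000000;		/* one very long sleep */

	/* Without clamping, one long sleep inflates the average for a long time. */
	uint64_t unclamped = avg_sleep_time;
	update_avg(&unclamped, long_sleep);

	/* Clamping the sample to 2 * short_sleep keeps the average responsive. */
	uint64_t sample = long_sleep < 2 * short_sleep ? long_sleep : 2 * short_sleep;
	uint64_t clamped = avg_sleep_time;
	update_avg(&clamped, sample);

	printf("unclamped avg = %llu ns, clamped avg = %llu ns\n",
	       (unsigned long long)unclamped, (unsigned long long)clamped);
	return 0;
}
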
+
+#ifdef CONFIG_CGROUP_SCHED
+u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return tg->upmigrate_discouraged;
+}
+
+int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 upmigrate_discourage)
+{
+ struct task_group *tg = css_tg(css);
+ int discourage = upmigrate_discourage > 0;
+
+ if (tg->upmigrate_discouraged == discourage)
+ return 0;
+
+ /*
+ * Revisit big-task classification for tasks of this cgroup. It would
+ * have been efficient to walk tasks of just this cgroup in running
+ * state, but we don't have easy means to do that. Walk all tasks in
+ * running state on all cpus instead and re-visit their big task
+ * classification.
+ */
+ get_online_cpus();
+ pre_big_task_count_change(cpu_online_mask);
+
+ tg->upmigrate_discouraged = discourage;
+
+ post_big_task_count_change(cpu_online_mask);
+ put_online_cpus();
+
+ return 0;
+}
+#endif /* CONFIG_CGROUP_SCHED */
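
As a hedged usage sketch of the grouping interface added in this file, the comment block below walks through a hypothetical call sequence; the plumbing that actually invokes sched_set_group_id() (sysctl/procfs or userspace hints) is outside this hunk.

/*
 * Hypothetical sequence, assuming 'p' is a valid task_struct pointer:
 *
 *	sched_set_group_id(p, 2);	// attach p to related_thread_group 2
 *	sched_get_group_id(p);		// now returns 2
 *	sched_set_group_id(p, 0);	// group_id 0 detaches p from its group
 *
 * Moving directly from one non-zero group to another is a no-op in
 * __sched_set_group_id() (a detach is required first), and group id 1
 * (DEFAULT_CGROUP_COLOC_ID) is reserved for the cgroup colocation path,
 * so sched_set_group_id() rejects it with -EINVAL.
 */
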
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 33d7003fa1b8..d562efb04775 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -80,6 +80,26 @@ static void update_curr_idle(struct rq *rq)
{
}
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void
+dec_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void
+fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+}
+
+#endif
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -108,4 +128,9 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_idle,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_idle,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle,
+#endif
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4184d4a97fd2..af21389466b8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,11 +5,12 @@
#include "sched.h"
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <trace/events/sched.h>
#include <linux/hrtimer.h>
-#include "walt.h"
#include "tune.h"
int sched_rr_timeslice = RR_TIMESLICE;
@@ -257,8 +258,12 @@ static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
- /* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ /*
+ * Try to pull RT tasks here if we lower this rq's prio and cpu is not
+ * isolated
+ */
+ return rq->rt.highest_prio.curr > prev->prio &&
+ !cpu_isolated(cpu_of(rq));
}
static inline int rt_overloaded(struct rq *rq)
@@ -429,7 +434,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
- return !list_empty(&rt_se->run_list);
+ return rt_se->on_rq;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -475,8 +480,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q;
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
@@ -492,7 +497,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
- enqueue_rt_entity(rt_se, false);
+ enqueue_rt_entity(rt_se, 0);
if (rt_rq->highest_prio.curr < curr->prio)
resched_curr(rq);
@@ -509,7 +514,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
if (!rt_se)
dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, 0);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1243,6 +1248,41 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
@@ -1279,7 +1319,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
dec_rt_group(rt_se, rt_rq);
}
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ return false;
+
+ return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+ list_del_init(&rt_se->run_list);
+
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1292,26 +1355,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other
* active members.
*/
- if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+ if (rt_se->on_list)
+ __delist_rt_entity(rt_se, array);
return;
+ }
- if (head)
- list_add(&rt_se->run_list, queue);
- else
- list_add_tail(&rt_se->run_list, queue);
- __set_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(rt_se->on_list);
+ if (flags & ENQUEUE_HEAD)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+ rt_se->on_list = 1;
+ }
+ rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq);
}
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
- list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(!rt_se->on_list);
+ __delist_rt_entity(rt_se, array);
+ }
+ rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1320,7 +1394,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
@@ -1333,31 +1407,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se);
+ __dequeue_rt_entity(rt_se, flags);
}
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, head);
+ __enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt);
}
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, false);
+ __enqueue_rt_entity(rt_se, flags);
}
enqueue_top_rt_rq(&rq->rt);
}
@@ -1373,8 +1447,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- walt_inc_cumulative_runnable_avg(rq, p);
+ enqueue_rt_entity(rt_se, flags);
+ inc_hmp_sched_stats_rt(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1412,8 +1486,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- dequeue_rt_entity(rt_se);
- walt_dec_cumulative_runnable_avg(rq, p);
+ dequeue_rt_entity(rt_se, flags);
+ dec_hmp_sched_stats_rt(rq, p);
dequeue_pushable_task(rq, p);
@@ -1468,6 +1542,40 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+#ifdef CONFIG_SCHED_HMP
+static int
+select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ int target;
+
+ rcu_read_lock();
+ target = find_lowest_rq(p);
+ if (target != -1)
+ cpu = target;
+ rcu_read_unlock();
+
+ return cpu;
+}
+#endif
+
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softint, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softints.
+ */
+bool
+task_may_not_preempt(struct task_struct *task, int cpu)
+{
+ __u32 softirqs = per_cpu(active_softirqs, cpu) |
+ __IRQ_STAT(cpu, __softirq_pending);
+ struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+
+ return ((softirqs & LONG_SOFTIRQ_MASK) &&
+ (task == cpu_ksoftirqd ||
+ task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+}
+
/*
* Perform a schedtune dequeue and cancelation of boost timers if needed.
* Should be called only with the rq->lock held.
@@ -1500,6 +1608,11 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
{
struct task_struct *curr;
struct rq *rq;
+ bool may_not_preempt;
+
+#ifdef CONFIG_SCHED_HMP
+ return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
+#endif
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1511,7 +1624,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
curr = READ_ONCE(rq->curr); /* unlocked access */
/*
- * If the current task on @p's runqueue is an RT task, then
+ * If the current task on @p's runqueue is a softirq task,
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -1532,17 +1655,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/
- if (curr && unlikely(rt_task(curr)) &&
+ may_not_preempt = task_may_not_preempt(curr, cpu);
+ if (may_not_preempt ||
+ (unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ curr->prio <= p->prio))) {
int target = find_lowest_rq(p);
/*
- * Don't bother moving it if the destination CPU is
- * not running a lower priority task.
+ * If cpu is non-preemptible, prefer remote cpu
+ * even if it's running a higher-prio task.
+ * Otherwise: Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
*/
if (target != -1 &&
- p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ (may_not_preempt ||
+ p->prio < cpu_rq(target)->rt.highest_prio.curr))
cpu = target;
}
rcu_read_unlock();
@@ -1751,6 +1879,109 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+#ifdef CONFIG_SCHED_HMP
+
+static int find_lowest_rq_hmp(struct task_struct *task)
+{
+ struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask);
+ struct cpumask candidate_mask = CPU_MASK_NONE;
+ struct sched_cluster *cluster;
+ int best_cpu = -1;
+ int prev_cpu = task_cpu(task);
+ u64 cpu_load, min_load = ULLONG_MAX;
+ int i;
+ int restrict_cluster;
+ int boost_on_big;
+ int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX;
+
+ boost_on_big = sched_boost() == FULL_THROTTLE_BOOST &&
+ sched_boost_policy() == SCHED_BOOST_ON_BIG;
+
+ restrict_cluster = sysctl_sched_restrict_cluster_spill;
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return best_cpu;
+
+ if (task->nr_cpus_allowed == 1)
+ return best_cpu; /* No other targets possible */
+
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return best_cpu; /* No targets found */
+
+ pack_task = is_short_burst_task(task);
+
+ /*
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ */
+
+retry:
+ for_each_sched_cluster(cluster) {
+ if (boost_on_big && cluster->capacity != max_possible_capacity)
+ continue;
+
+ cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
+ cpumask_andnot(&candidate_mask, &candidate_mask,
+ cpu_isolated_mask);
+ /*
+ * When placement boost is active, if there is no eligible CPU
+ * in the highest capacity cluster, we fall back to the other
+ * clusters. So clear the CPUs of the traversed cluster from
+ * the lowest_mask.
+ */
+ if (unlikely(boost_on_big))
+ cpumask_andnot(lowest_mask, lowest_mask,
+ &cluster->cpus);
+
+ if (cpumask_empty(&candidate_mask))
+ continue;
+
+ for_each_cpu(i, &candidate_mask) {
+ if (sched_cpu_high_irqload(i))
+ continue;
+
+ cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg;
+ if (!restrict_cluster)
+ cpu_load = scale_load_to_cpu(cpu_load, i);
+
+ if (pack_task) {
+ wakeup_latency = cpu_rq(i)->wakeup_latency;
+
+ if (wakeup_latency > least_wakeup_latency)
+ continue;
+
+ if (wakeup_latency < least_wakeup_latency) {
+ least_wakeup_latency = wakeup_latency;
+ min_load = cpu_load;
+ best_cpu = i;
+ continue;
+ }
+ }
+
+ if (cpu_load < min_load ||
+ (cpu_load == min_load &&
+ (i == prev_cpu || (best_cpu != prev_cpu &&
+ cpus_share_cache(prev_cpu, i))))) {
+ min_load = cpu_load;
+ best_cpu = i;
+ }
+ }
+
+ if (restrict_cluster && best_cpu != -1)
+ break;
+ }
+
+ if (unlikely(boost_on_big && best_cpu == -1)) {
+ boost_on_big = 0;
+ goto retry;
+ }
+
+ return best_cpu;
+}
+#endif /* CONFIG_SCHED_HMP */
+
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
@@ -1758,6 +1989,10 @@ static int find_lowest_rq(struct task_struct *task)
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+#ifdef CONFIG_SCHED_HMP
+ return find_lowest_rq_hmp(task);
+#endif
+
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
return -1;
@@ -1974,11 +2209,13 @@ retry:
goto retry;
}
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, lowest_rq->cpu);
next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(lowest_rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
resched_curr(lowest_rq);
@@ -2248,11 +2485,13 @@ static void pull_rt_task(struct rq *this_rq)
resched = true;
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2325,7 +2564,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
+ cpu_isolated(cpu_of(rq)))
return;
queue_pull_task(rq);
@@ -2340,6 +2580,7 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
+
#endif /* CONFIG_SMP */
/*
@@ -2513,6 +2754,11 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_rt,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_rt,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
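
A minimal standalone sketch of the bitmask test behind task_may_not_preempt() in the rt.c hunk above. The softirq numbering mirrors <linux/interrupt.h>, but the contents of LONG_SOFTIRQ_MASK here are an assumption; the real definition comes from the softirq changes outside this hunk.

#include <stdio.h>

/* Assumed bit positions, mirroring the kernel's softirq numbering. */
enum { HI = 0, TIMER, NET_TX, NET_RX, BLOCK, IRQ_POLL, TASKLET, SCHED, HRTIMER, RCU };

/* Hypothetical "long running" softirqs: networking and block. */
#define LONG_SOFTIRQ_MASK ((1u << NET_RX) | (1u << NET_TX) | (1u << BLOCK))

static int cpu_busy_with_long_softirq(unsigned int active, unsigned int pending)
{
	/*
	 * Same shape as task_may_not_preempt(): softirqs currently being
	 * handled and softirqs already raised both count, as long as one
	 * of them is in the "long" set.
	 */
	return ((active | pending) & LONG_SOFTIRQ_MASK) != 0;
}

int main(void)
{
	printf("%d\n", cpu_busy_with_long_softirq(1u << NET_RX, 0));	/* 1 */
	printf("%d\n", cpu_busy_with_long_softirq(1u << TIMER, 0));	/* 0 */
	return 0;
}
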
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f18a3243af5..8cb8b1c555c8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -27,14 +27,13 @@ extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
+
extern long calc_load_fold_active(struct rq *this_rq);
#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
-extern void check_for_migration(struct rq *rq, struct task_struct *p);
#else
static inline void update_cpu_load_active(struct rq *this_rq) { }
-static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
#endif
/*
@@ -244,6 +243,10 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;
+#ifdef CONFIG_SCHED_HMP
+ bool upmigrate_discouraged;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
@@ -349,12 +352,96 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */
+extern struct task_group *css_tg(struct cgroup_subsys_state *css);
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+#ifdef CONFIG_SCHED_HMP
+
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
+
+struct hmp_sched_stats {
+ int nr_big_tasks;
+ u64 cumulative_runnable_avg;
+ u64 pred_demands_sum;
+};
+
+struct load_subtractions {
+ u64 window_start;
+ u64 subs;
+ u64 new_subs;
+};
+
+struct group_cpu_time {
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+};
+
+struct sched_cluster {
+ raw_spinlock_t load_lock;
+ struct list_head list;
+ struct cpumask cpus;
+ int id;
+ int max_power_cost;
+ int min_power_cost;
+ int max_possible_capacity;
+ int capacity;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ unsigned int exec_scale_factor;
+ /*
+ * max_freq = user maximum
+ * max_mitigated_freq = thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq;
+ unsigned int max_possible_freq;
+ bool freq_init_done;
+ int dstate, dstate_wakeup_latency, dstate_wakeup_energy;
+ unsigned int static_cluster_pwr_cost;
+ int notifier_sent;
+ bool wake_up_idle;
+ atomic64_t last_cc_update;
+ atomic64_t cycles;
+};
+
+extern unsigned long all_cluster_ids[];
+
+static inline int cluster_first_cpu(struct sched_cluster *cluster)
+{
+ return cpumask_first(&cluster->cpus);
+}
+
+struct related_thread_group {
+ int id;
+ raw_spinlock_t lock;
+ struct list_head tasks;
+ struct list_head list;
+ struct sched_cluster *preferred_cluster;
+ struct rcu_head rcu;
+ u64 last_update;
+};
+
+extern struct list_head cluster_head;
+extern int num_clusters;
+extern struct sched_cluster *sched_cluster[NR_CPUS];
+
+struct cpu_cycle {
+ u64 cycles;
+ u64 time;
+};
+
+#define for_each_sched_cluster(cluster) \
+ list_for_each_entry_rcu(cluster, &cluster_head, list)
+
+#endif /* CONFIG_SCHED_HMP */
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -423,11 +510,12 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
-#ifdef CONFIG_SCHED_WALT
- u64 cumulative_runnable_avg;
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef CONFIG_SCHED_HMP
+ struct hmp_sched_stats hmp_stats;
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
@@ -697,6 +785,37 @@ struct rq {
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_HMP
+ struct sched_cluster *cluster;
+ struct cpumask freq_domain_cpumask;
+ struct hmp_sched_stats hmp_stats;
+
+ int cstate, wakeup_latency, wakeup_energy;
+ u64 window_start;
+ unsigned long hmp_flags;
+
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+ unsigned int static_cpu_pwr_cost;
+ struct task_struct *ed_task;
+ struct cpu_cycle cc;
+ u64 old_busy_time, old_busy_time_group;
+ u64 old_estimated_time;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ struct group_cpu_time grp_time;
+ struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
+ DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
+ NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
+ u8 *top_tasks[NUM_TRACKED_WINDOWS];
+ u8 curr_table;
+ int prev_top;
+ int curr_top;
+#endif
+
#ifdef CONFIG_SCHED_WALT
u64 cumulative_runnable_avg;
u64 window_start;
@@ -710,7 +829,6 @@ struct rq {
u64 cum_window_demand;
#endif /* CONFIG_SCHED_WALT */
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -985,6 +1103,653 @@ static inline void sched_ttwu_pending(void) { }
#include "stats.h"
#include "auto_group.h"
+enum sched_boost_policy {
+ SCHED_BOOST_NONE,
+ SCHED_BOOST_ON_BIG,
+ SCHED_BOOST_ON_ALL,
+};
+
+#ifdef CONFIG_SCHED_HMP
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define SCHED_UPMIGRATE_MIN_NICE 15
+#define EXITING_TASK_MARKER 0xdeaddead
+
+#define UP_MIGRATION 1
+#define DOWN_MIGRATION 2
+#define IRQLOAD_MIGRATION 3
+
+extern struct mutex policy_mutex;
+extern unsigned int sched_ravg_window;
+extern unsigned int sched_disable_window_stats;
+extern unsigned int max_possible_freq;
+extern unsigned int min_max_freq;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern unsigned int max_possible_efficiency;
+extern unsigned int min_possible_efficiency;
+extern unsigned int max_capacity;
+extern unsigned int min_capacity;
+extern unsigned int max_load_scale_factor;
+extern unsigned int max_possible_capacity;
+extern unsigned int min_max_possible_capacity;
+extern unsigned int max_power_cost;
+extern unsigned int sched_init_task_load_windows;
+extern unsigned int up_down_migrate_scale_factor;
+extern unsigned int sysctl_sched_restrict_cluster_spill;
+extern unsigned int sched_pred_alert_load;
+extern struct sched_cluster init_cluster;
+extern unsigned int __read_mostly sched_short_sleep_task_threshold;
+extern unsigned int __read_mostly sched_long_cpu_selection_threshold;
+extern unsigned int __read_mostly sched_big_waker_task_load;
+extern unsigned int __read_mostly sched_small_wakee_task_load;
+extern unsigned int __read_mostly sched_spill_load;
+extern unsigned int __read_mostly sched_upmigrate;
+extern unsigned int __read_mostly sched_downmigrate;
+extern unsigned int __read_mostly sysctl_sched_spill_nr_run;
+extern unsigned int __read_mostly sched_load_granule;
+
+extern void init_new_task_load(struct task_struct *p, bool idle_task);
+extern u64 sched_ktime_clock(void);
+extern int got_boost_kick(void);
+extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
+extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+extern bool early_detection_notify(struct rq *rq, u64 wallclock);
+extern void clear_ed_task(struct task_struct *p, struct rq *rq);
+extern void fixup_busy_time(struct task_struct *p, int new_cpu);
+extern void clear_boost_kick(int cpu);
+extern void clear_hmp_request(int cpu);
+extern void mark_task_starting(struct task_struct *p);
+extern void set_window_start(struct rq *rq);
+extern void update_cluster_topology(void);
+extern void note_task_waking(struct task_struct *p, u64 wallclock);
+extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock);
+extern void init_clusters(void);
+extern void reset_cpu_hmp_stats(int cpu, int reset_cra);
+extern unsigned int max_task_load(void);
+extern void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock);
+extern void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock);
+extern unsigned int cpu_temp(int cpu);
+extern unsigned int nr_eligible_big_tasks(int cpu);
+extern int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load);
+extern void set_preferred_cluster(struct related_thread_group *grp);
+extern void add_new_task_to_grp(struct task_struct *new);
+extern unsigned int update_freq_aggregate_threshold(unsigned int threshold);
+extern void update_avg_burst(struct task_struct *p);
+extern void update_avg(u64 *avg, u64 sample);
+
+#define NO_BOOST 0
+#define FULL_THROTTLE_BOOST 1
+#define CONSERVATIVE_BOOST 2
+#define RESTRAINED_BOOST 3
+
+static inline struct sched_cluster *cpu_cluster(int cpu)
+{
+ return cpu_rq(cpu)->cluster;
+}
+
+static inline int cpu_capacity(int cpu)
+{
+ return cpu_rq(cpu)->cluster->capacity;
+}
+
+static inline int cpu_max_possible_capacity(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_possible_capacity;
+}
+
+static inline int cpu_load_scale_factor(int cpu)
+{
+ return cpu_rq(cpu)->cluster->load_scale_factor;
+}
+
+static inline int cpu_efficiency(int cpu)
+{
+ return cpu_rq(cpu)->cluster->efficiency;
+}
+
+static inline unsigned int cpu_cur_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->cur_freq;
+}
+
+static inline unsigned int cpu_min_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->min_freq;
+}
+
+static inline unsigned int cluster_max_freq(struct sched_cluster *cluster)
+{
+ /*
+	 * The governor and the thermal driver don't know about each other's
+	 * mitigation voting, so struct sched_cluster saves both and returns
+	 * their min() as the current cluster fmax.
+ */
+ return min(cluster->max_mitigated_freq, cluster->max_freq);
+}
+
+static inline unsigned int cpu_max_freq(int cpu)
+{
+ return cluster_max_freq(cpu_rq(cpu)->cluster);
+}
+
+static inline unsigned int cpu_max_possible_freq(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_possible_freq;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu)
+{
+ return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
+}
+
+static inline int cpu_max_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->max_power_cost;
+}
+
+static inline int cpu_min_power_cost(int cpu)
+{
+ return cpu_rq(cpu)->cluster->min_power_cost;
+}
+
+static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period)
+{
+ return div64_u64(cycles, period);
+}
+
+static inline bool hmp_capable(void)
+{
+ return max_possible_capacity != min_max_possible_capacity;
+}
+
+static inline bool is_max_capacity_cpu(int cpu)
+{
+ return cpu_max_possible_capacity(cpu) == max_possible_capacity;
+}
+
+static inline bool is_min_capacity_cpu(int cpu)
+{
+ return cpu_max_possible_capacity(cpu) == min_max_possible_capacity;
+}
+
+/*
+ * 'load' is expressed relative to the "best cpu" running at its best
+ * frequency. Scale it for the given cpu, accounting for how much worse
+ * that cpu is than the "best cpu".
+ */
+static inline u64 scale_load_to_cpu(u64 task_load, int cpu)
+{
+ u64 lsf = cpu_load_scale_factor(cpu);
+
+ if (lsf != 1024) {
+ task_load *= lsf;
+ task_load /= 1024;
+ }
+
+ return task_load;
+}
+
+static inline unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+static inline void
+inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+ u32 task_load;
+
+ if (sched_disable_window_stats)
+ return;
+
+ task_load = sched_disable_window_stats ? 0 : p->ravg.demand;
+
+ stats->cumulative_runnable_avg += task_load;
+ stats->pred_demands_sum += p->ravg.pred_demand;
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+ u32 task_load;
+
+ if (sched_disable_window_stats)
+ return;
+
+ task_load = sched_disable_window_stats ? 0 : p->ravg.demand;
+
+ stats->cumulative_runnable_avg -= task_load;
+
+ BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+ stats->pred_demands_sum -= p->ravg.pred_demand;
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+static inline void
+fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 task_load_delta,
+ s64 pred_demand_delta)
+{
+ if (sched_disable_window_stats)
+ return;
+
+ stats->cumulative_runnable_avg += task_load_delta;
+ BUG_ON((s64)stats->cumulative_runnable_avg < 0);
+
+ stats->pred_demands_sum += pred_demand_delta;
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+#define pct_to_real(tunable) \
+ (div64_u64((u64)tunable * (u64)max_task_load(), 100))
+
+#define real_to_pct(tunable) \
+ (div64_u64((u64)tunable * (u64)100, (u64)max_task_load()))
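Illustration only, not part of the patch: pct_to_real()/real_to_pct() are plain integer percent conversions against max_task_load(). A minimal userspace model, with a made-up max task load of 1024:

#include <stdint.h>
#include <stdio.h>

static uint64_t pct_to_real(uint64_t pct, uint64_t max_task_load)
{
	return pct * max_task_load / 100;
}

static uint64_t real_to_pct(uint64_t load, uint64_t max_task_load)
{
	return load * 100 / max_task_load;
}

int main(void)
{
	uint64_t max_load = 1024;			/* example value only */
	uint64_t upmigrate = pct_to_real(80, max_load);	/* 819 */

	printf("80%% of %llu -> %llu, back to %llu%%\n",
	       (unsigned long long)max_load,
	       (unsigned long long)upmigrate,
	       (unsigned long long)real_to_pct(upmigrate, max_load)); /* 79 */
	return 0;
}

Note that the round trip loses a little precision (80% -> 819 -> 79%) because both conversions truncate.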
+
+#define SCHED_HIGH_IRQ_TIMEOUT 3
+static inline u64 sched_irqload(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+
+ delta = get_jiffies_64() - rq->irqload_ts;
+ /*
+	 * The current context can be preempted by an irq, and irq context can
+	 * update rq->irqload_ts, so delta may be negative. That is fine: a
+	 * negative delta just means an irq occurred very recently, and we can
+	 * safely return the averaged irqload.
+ */
+
+ if (delta < SCHED_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+static inline int sched_cpu_high_irqload(int cpu)
+{
+ return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
+}
+
+static inline bool task_in_related_thread_group(struct task_struct *p)
+{
+	return rcu_access_pointer(p->grp) != NULL;
+}
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return rcu_dereference(p->grp);
+}
+
+#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand)
+
+extern void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups);
+
+extern void notify_migration(int src_cpu, int dest_cpu,
+ bool src_cpu_dead, struct task_struct *p);
+
+/* Is frequency of two cpus synchronized with each other? */
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ struct rq *rq = cpu_rq(src_cpu);
+
+ if (src_cpu == dst_cpu)
+ return 1;
+
+ return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
+}
+
+#define BOOST_KICK 0
+#define CPU_RESERVED 1
+
+static inline int is_reserved(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return test_bit(CPU_RESERVED, &rq->hmp_flags);
+}
+
+static inline int mark_reserved(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ /* Name boost_flags as hmp_flags? */
+ return test_and_set_bit(CPU_RESERVED, &rq->hmp_flags);
+}
+
+static inline void clear_reserved(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ clear_bit(CPU_RESERVED, &rq->hmp_flags);
+}
+
+static inline u64 cpu_cravg_sync(int cpu, int sync)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 load;
+
+ load = rq->hmp_stats.cumulative_runnable_avg;
+
+ /*
+ * If load is being checked in a sync wakeup environment,
+ * we may want to discount the load of the currently running
+ * task.
+ */
+ if (sync && cpu == smp_processor_id()) {
+ if (load > rq->curr->ravg.demand)
+ load -= rq->curr->ravg.demand;
+ else
+ load = 0;
+ }
+
+ return load;
+}
+
+static inline bool is_short_burst_task(struct task_struct *p)
+{
+ return p->ravg.avg_burst < sysctl_sched_short_burst &&
+ p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
+}
+
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
+extern void pre_big_task_count_change(const struct cpumask *cpus);
+extern void post_big_task_count_change(const struct cpumask *cpus);
+extern void set_hmp_defaults(void);
+extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost);
+extern unsigned int power_cost(int cpu, u64 demand);
+extern void reset_all_window_stats(u64 window_start, unsigned int window_size);
+extern int sched_boost(void);
+extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
+ enum sched_boost_policy boost_policy);
+extern enum sched_boost_policy sched_boost_policy(void);
+extern int task_will_fit(struct task_struct *p, int cpu);
+extern u64 cpu_load(int cpu);
+extern u64 cpu_load_sync(int cpu, int sync);
+extern int preferred_cluster(struct sched_cluster *cluster,
+ struct task_struct *p);
+extern void inc_nr_big_task(struct hmp_sched_stats *stats,
+ struct task_struct *p);
+extern void dec_nr_big_task(struct hmp_sched_stats *stats,
+ struct task_struct *p);
+extern void inc_rq_hmp_stats(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern void dec_rq_hmp_stats(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra);
+extern int is_big_task(struct task_struct *p);
+extern int upmigrate_discouraged(struct task_struct *p);
+extern struct sched_cluster *rq_cluster(struct rq *rq);
+extern int nr_big_tasks(struct rq *rq);
+extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
+ struct task_struct *p, s64 delta);
+extern void reset_task_stats(struct task_struct *p);
+extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra);
+extern void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra);
+extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft);
+extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 upmigrate_discourage);
+extern void sched_boost_parse_dt(void);
+extern void clear_top_tasks_bitmap(unsigned long *bitmap);
+
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+extern bool task_sched_boost(struct task_struct *p);
+extern int sync_cgroup_colocation(struct task_struct *p, bool insert);
+extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2);
+extern void update_cgroup_boost_settings(void);
+extern void restore_cgroup_boost_settings(void);
+
+#else
+static inline bool
+same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+ return true;
+}
+
+static inline bool task_sched_boost(struct task_struct *p)
+{
+ return true;
+}
+
+static inline void update_cgroup_boost_settings(void) { }
+static inline void restore_cgroup_boost_settings(void) { }
+#endif
+
+extern int alloc_related_thread_groups(void);
+
+#else /* CONFIG_SCHED_HMP */
+
+struct hmp_sched_stats;
+struct related_thread_group;
+struct sched_cluster;
+
+static inline enum sched_boost_policy sched_boost_policy(void)
+{
+ return SCHED_BOOST_NONE;
+}
+
+static inline bool task_sched_boost(struct task_struct *p)
+{
+ return true;
+}
+
+static inline int got_boost_kick(void)
+{
+ return 0;
+}
+
+static inline void update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+
+static inline bool early_detection_notify(struct rq *rq, u64 wallclock)
+{
+ return 0;
+}
+
+static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { }
+static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void clear_boost_kick(int cpu) { }
+static inline void clear_hmp_request(int cpu) { }
+static inline void mark_task_starting(struct task_struct *p) { }
+static inline void set_window_start(struct rq *rq) { }
+static inline void init_clusters(void) {}
+static inline void update_cluster_topology(void) { }
+static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
+static inline void set_task_last_switch_out(struct task_struct *p,
+ u64 wallclock) { }
+
+static inline int task_will_fit(struct task_struct *p, int cpu)
+{
+ return 1;
+}
+
+static inline int select_best_cpu(struct task_struct *p, int target,
+ int reason, int sync)
+{
+ return 0;
+}
+
+static inline unsigned int power_cost(int cpu, u64 demand)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline int sched_boost(void)
+{
+ return 0;
+}
+
+static inline int is_big_task(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline int nr_big_tasks(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int is_cpu_throttling_imminent(int cpu)
+{
+ return 0;
+}
+
+static inline int is_task_migration_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline unsigned int cpu_temp(int cpu)
+{
+ return 0;
+}
+
+static inline void
+inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+
+static inline void
+dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { }
+
+static inline void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ return 1;
+}
+
+static inline struct sched_cluster *rq_cluster(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void init_new_task_load(struct task_struct *p, bool idle_task)
+{
+}
+
+static inline u64 scale_load_to_cpu(u64 load, int cpu)
+{
+ return load;
+}
+
+static inline unsigned int nr_eligible_big_tasks(int cpu)
+{
+ return 0;
+}
+
+static inline bool is_max_capacity_cpu(int cpu) { return true; }
+
+static inline int pct_task_load(struct task_struct *p) { return 0; }
+
+static inline int cpu_capacity(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
+
+static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats,
+ struct task_struct *p)
+{
+}
+
+static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+}
+
+static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
+ u64 wallclock)
+{
+}
+
+static inline int sched_cpu_high_irqload(int cpu) { return 0; }
+
+static inline void set_preferred_cluster(struct related_thread_group *grp) { }
+
+static inline bool task_in_related_thread_group(struct task_struct *p)
+{
+ return false;
+}
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+static inline u32 task_load(struct task_struct *p) { return 0; }
+
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ return 0;
+}
+
+static inline void add_new_task_to_grp(struct task_struct *new) {}
+
+#define PRED_DEMAND_DELTA (0)
+
+static inline void
+check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { }
+
+static inline void notify_migration(int src_cpu, int dest_cpu,
+ bool src_cpu_dead, struct task_struct *p) { }
+
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ return 1;
+}
+
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
+static inline void pre_big_task_count_change(void) { }
+static inline void post_big_task_count_change(void) { }
+static inline void set_hmp_defaults(void) { }
+
+static inline void clear_reserved(int cpu) { }
+static inline void sched_boost_parse_dt(void) {}
+static inline int alloc_related_thread_groups(void) { return 0; }
+
+#define trace_sched_cpu_load(...)
+#define trace_sched_cpu_load_lb(...)
+#define trace_sched_cpu_load_cgroup(...)
+#define trace_sched_cpu_load_wakeup(...)
+
+static inline void update_avg_burst(struct task_struct *p) {}
+
+#endif /* CONFIG_SCHED_HMP */
+
+/*
+ * Returns the rq capacity of any rq in a group. This does not play
+ * well with groups where rq capacity can change independently.
+ */
+#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
+
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -1031,7 +1796,6 @@ static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-
#endif /* CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
@@ -1185,6 +1949,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+#define WF_NO_NOTIFIER 0x08 /* do not notify governor */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1239,19 +2004,41 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+
#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_HEAD 0x02
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING 0x20
#else
#define ENQUEUE_WAKING 0x00
#endif
-#define ENQUEUE_REPLENISH 0x08
-#define ENQUEUE_RESTORE 0x10
-#define ENQUEUE_WAKEUP_NEW 0x20
-
-#define DEQUEUE_SLEEP 0x01
-#define DEQUEUE_SAVE 0x02
+#define ENQUEUE_WAKEUP_NEW 0x40
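Sketch only, not part of this patch: the SAVE/RESTORE pair documented above is typically used to take a task off the runqueue, change its scheduling attributes, and put it back without treating that as a real sleep/wakeup. Assuming the dequeue_task()/enqueue_task() helpers and task_on_rq_queued() from core.c, the usual pattern looks roughly like:

	int queued = task_on_rq_queued(p);

	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE);

	/* ... change p's scheduling parameters here ... */

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_MOVE);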
#define RETRY_TASK ((void *)-1UL)
@@ -1318,6 +2105,12 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
+#ifdef CONFIG_SCHED_HMP
+ void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
+ void (*dec_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
+ void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand);
+#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
@@ -1342,6 +2135,7 @@ extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
+extern void nohz_balance_clear_nohz_mask(int cpu);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
@@ -1399,7 +2193,9 @@ static inline int idle_get_state_idx(struct rq *rq)
}
#endif
+#ifdef CONFIG_SYSRQ_SCHED_DEBUG
extern void sysrq_sched_debug_show(void);
+#endif
extern void sched_init_granularity(void);
extern void update_max_interval(void);
@@ -1427,6 +2223,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
+ sched_update_nr_prod(cpu_of(rq), count, true);
rq->nr_running = prev_nr + count;
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1453,6 +2250,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count)
static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
+ sched_update_nr_prod(cpu_of(rq), count, false);
rq->nr_running -= count;
}
@@ -1616,6 +2414,7 @@ static inline unsigned long __cpu_util(int cpu, int delta)
util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
walt_ravg_window >> SCHED_LOAD_SHIFT);
#endif
+
delta += util;
if (delta < 0)
return 0;
@@ -1643,6 +2442,20 @@ static inline unsigned long cpu_util_freq(int cpu)
#endif
+#ifdef CONFIG_SCHED_HMP
+/*
+ * HMP and EAS are orthogonal. The compiler should elide all code guarded by
+ * the energy_aware() check, so that we don't even pay the comparison penalty
+ * at runtime.
+ */
+#define energy_aware() false
+#else
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+#endif
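A minimal sketch (not from the patch) of why the constant-false definition above is enough: any branch guarded by energy_aware() becomes statically dead under CONFIG_SCHED_HMP and the optimizer drops it.

#include <stdbool.h>
#include <stdio.h>

#define energy_aware() false	/* models the CONFIG_SCHED_HMP case */

int main(void)
{
	if (energy_aware())
		puts("EAS-only path");	/* statically dead; eliminated at -O2 */
	else
		puts("HMP path");
	return 0;
}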
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1883,6 +2696,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+/*
+ * task_may_not_preempt - check whether a task may not be preemptible soon
+ */
+extern bool task_may_not_preempt(struct task_struct *task, int cpu);
+
#else /* CONFIG_SMP */
/*
@@ -1950,6 +2768,9 @@ enum rq_nohz_flag_bits {
NOHZ_BALANCE_KICK,
};
+#define NOHZ_KICK_ANY 0
+#define NOHZ_KICK_RESTRICT 1
+
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
new file mode 100644
index 000000000000..f03ed685f102
--- /dev/null
+++ b/kernel/sched/sched_avg.c
@@ -0,0 +1,199 @@
+/* Copyright (c) 2012, 2015-2017, 2018 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * Scheduler hook for average runqueue determination
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+
+#include "sched.h"
+#include <trace/events/sched.h>
+
+static DEFINE_PER_CPU(u64, nr_prod_sum);
+static DEFINE_PER_CPU(u64, last_time);
+static DEFINE_PER_CPU(u64, nr_big_prod_sum);
+static DEFINE_PER_CPU(u64, nr);
+static DEFINE_PER_CPU(u64, nr_max);
+
+static DEFINE_PER_CPU(unsigned long, iowait_prod_sum);
+static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
+static s64 last_get_time;
+
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
+/**
+ * sched_get_nr_running_avg
+ * @return: Average nr_running, iowait and nr_big_tasks values since the
+ *		last poll, each rounded up to the nearest integer, plus the
+ *		peak nr_running seen on any CPU and on any big CPU since the
+ *		last poll.
+ *
+ * Obtains the average nr_running value since the last poll.
+ * This function must not be called concurrently with itself.
+ */
+void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg,
+ unsigned int *max_nr, unsigned int *big_max_nr)
+{
+ int cpu;
+ u64 curr_time = sched_clock();
+ u64 diff = curr_time - last_get_time;
+ u64 tmp_avg = 0, tmp_iowait = 0, tmp_big_avg = 0;
+
+ *avg = 0;
+ *iowait_avg = 0;
+ *big_avg = 0;
+ *max_nr = 0;
+ *big_max_nr = 0;
+
+ if (!diff)
+ return;
+
+ /* read and reset nr_running counts */
+ for_each_possible_cpu(cpu) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+ curr_time = sched_clock();
+ diff = curr_time - per_cpu(last_time, cpu);
+ BUG_ON((s64)diff < 0);
+
+ tmp_avg += per_cpu(nr_prod_sum, cpu);
+ tmp_avg += per_cpu(nr, cpu) * diff;
+
+ tmp_big_avg += per_cpu(nr_big_prod_sum, cpu);
+ tmp_big_avg += nr_eligible_big_tasks(cpu) * diff;
+
+ tmp_iowait += per_cpu(iowait_prod_sum, cpu);
+ tmp_iowait += nr_iowait_cpu(cpu) * diff;
+
+ per_cpu(last_time, cpu) = curr_time;
+
+ per_cpu(nr_prod_sum, cpu) = 0;
+ per_cpu(nr_big_prod_sum, cpu) = 0;
+ per_cpu(iowait_prod_sum, cpu) = 0;
+
+ if (*max_nr < per_cpu(nr_max, cpu))
+ *max_nr = per_cpu(nr_max, cpu);
+
+ if (is_max_capacity_cpu(cpu)) {
+ if (*big_max_nr < per_cpu(nr_max, cpu))
+ *big_max_nr = per_cpu(nr_max, cpu);
+ }
+
+ per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+ spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+ }
+
+ diff = curr_time - last_get_time;
+ last_get_time = curr_time;
+
+ /*
+	 * Any task running on the BIG cluster, and BIG tasks running on the
+	 * little cluster, contribute to big_avg. Small or medium tasks can
+	 * also run on the BIG cluster when the co-location and scheduler
+	 * boost features are active. We don't want these tasks to downmigrate
+	 * to the little cluster when BIG CPUs are available but isolated.
+	 * Round up the average values so that core_ctl aggressively
+	 * unisolates BIG CPUs.
+ */
+ *avg = (int)DIV64_U64_ROUNDUP(tmp_avg, diff);
+ *big_avg = (int)DIV64_U64_ROUNDUP(tmp_big_avg, diff);
+ *iowait_avg = (int)DIV64_U64_ROUNDUP(tmp_iowait, diff);
+
+ trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg,
+ *max_nr, *big_max_nr);
+
+ BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0);
+ pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n",
+ __func__, *avg, *big_avg, *iowait_avg);
+}
+EXPORT_SYMBOL(sched_get_nr_running_avg);
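Illustration only, not part of the patch: the per-cpu prod_sum bookkeeping above amounts to a time-weighted average of nr_running, where each update adds nr * dt and each poll divides by the elapsed time, rounding up. A small userspace model with made-up sample points:

#include <stdint.h>
#include <stdio.h>

#define DIV64_U64_ROUNDUP(x, y) (((x) + (y) - 1) / (y))

int main(void)
{
	/* (timestamp, nr_running just after the change) - made-up samples */
	struct { uint64_t t, nr; } s[] = { { 0, 2 }, { 40, 5 }, { 70, 1 } };
	uint64_t poll_at = 100, prod_sum = 0, last_t = 0, nr = 0;

	for (int i = 0; i < 3; i++) {
		prod_sum += nr * (s[i].t - last_t);	/* 'nr' held since last_t */
		last_t = s[i].t;
		nr = s[i].nr;
	}
	prod_sum += nr * (poll_at - last_t);		/* tail up to the poll */

	/* (2*40 + 5*30 + 1*30) / 100 = 260/100 -> rounded up to 3 */
	printf("avg nr_running = %llu\n",
	       (unsigned long long)DIV64_U64_ROUNDUP(prod_sum, poll_at));
	return 0;
}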
+
+static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0);
+
+#define BUSY_NR_RUN 3
+#define BUSY_LOAD_FACTOR 10
+
+#ifdef CONFIG_SCHED_HMP
+static inline void update_last_busy_time(int cpu, bool dequeue,
+ unsigned long prev_nr_run, u64 curr_time)
+{
+ bool nr_run_trigger = false, load_trigger = false;
+
+ if (!hmp_capable() || is_min_capacity_cpu(cpu))
+ return;
+
+ if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN)
+ nr_run_trigger = true;
+
+ if (dequeue) {
+ u64 load;
+
+ load = cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg;
+ load = scale_load_to_cpu(load, cpu);
+
+ if (load * BUSY_LOAD_FACTOR > sched_ravg_window)
+ load_trigger = true;
+ }
+
+ if (nr_run_trigger || load_trigger)
+ atomic64_set(&per_cpu(last_busy_time, cpu), curr_time);
+}
+#else
+static inline void update_last_busy_time(int cpu, bool dequeue,
+ unsigned long prev_nr_run, u64 curr_time)
+{
+}
+#endif
+
+/**
+ * sched_update_nr_prod
+ * @cpu: The CPU whose nr_running count is being updated
+ * @delta: Adjust nr by 'delta' amount
+ * @inc: Whether we are increasing or decreasing the count
+ * @return: N/A
+ *
+ * Update the running-average bookkeeping with the latest nr_running value
+ * for the given CPU.
+ */
+void sched_update_nr_prod(int cpu, long delta, bool inc)
+{
+ u64 diff;
+ u64 curr_time;
+ unsigned long flags, nr_running;
+
+ spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+ nr_running = per_cpu(nr, cpu);
+ curr_time = sched_clock();
+ diff = curr_time - per_cpu(last_time, cpu);
+ BUG_ON((s64)diff < 0);
+ per_cpu(last_time, cpu) = curr_time;
+ per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta);
+
+ BUG_ON((s64)per_cpu(nr, cpu) < 0);
+
+ if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
+ per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+
+ update_last_busy_time(cpu, !inc, nr_running, curr_time);
+
+ per_cpu(nr_prod_sum, cpu) += nr_running * diff;
+ per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff;
+ per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
+ spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+}
+EXPORT_SYMBOL(sched_update_nr_prod);
+
+u64 sched_get_cpu_last_busy_time(int cpu)
+{
+ return atomic64_read(&per_cpu(last_busy_time, cpu));
+}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index a5567ccd8803..3278c81cefb1 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,5 +1,4 @@
#include "sched.h"
-#include "walt.h"
/*
* stop-task scheduling class.
@@ -19,6 +18,41 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_HMP
+
+static void
+inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
+{
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p)
+{
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+fixup_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void
+inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
+
+static inline void
+dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -44,14 +78,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
- walt_inc_cumulative_runnable_avg(rq, p);
+ inc_hmp_sched_stats_stop(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
- walt_dec_cumulative_runnable_avg(rq, p);
+ dec_hmp_sched_stats_stop(rq, p);
}
static void yield_task_stop(struct rq *rq)
@@ -138,4 +172,9 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_stop,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_stop,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop,
+#endif
};
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 5e47c29b44f6..b84d13750604 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -121,6 +121,33 @@ struct schedtune {
/* Boost value for tasks on that SchedTune CGroup */
int boost;
+#ifdef CONFIG_SCHED_HMP
+ /* Toggle ability to override sched boost enabled */
+ bool sched_boost_no_override;
+
+ /*
+ * Controls whether a cgroup is eligible for sched boost or not. This
+	 * can temporarily be disabled by the kernel based on the no_override
+ * flag above.
+ */
+ bool sched_boost_enabled;
+
+ /*
+	 * This tracks the default value of sched_boost_enabled and is used to
+	 * restore the value following any temporary changes to that flag.
+ */
+ bool sched_boost_enabled_backup;
+
+ /*
+ * Controls whether tasks of this cgroup should be colocated with each
+ * other and tasks of other cgroups that have the same flag turned on.
+ */
+ bool colocate;
+
+ /* Controls whether further updates are allowed to the colocate flag */
+ bool colocate_update_disabled;
+#endif
+
/* Performance Boost (B) region threshold params */
int perf_boost_idx;
@@ -134,7 +161,7 @@ struct schedtune {
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
- return css ? container_of(css, struct schedtune, css) : NULL;
+ return container_of(css, struct schedtune, css);
}
static inline struct schedtune *task_schedtune(struct task_struct *tsk)
@@ -159,6 +186,13 @@ static inline struct schedtune *parent_st(struct schedtune *st)
static struct schedtune
root_schedtune = {
.boost = 0,
+#ifdef CONFIG_SCHED_HMP
+ .sched_boost_no_override = false,
+ .sched_boost_enabled = true,
+ .sched_boost_enabled_backup = true,
+ .colocate = false,
+ .colocate_update_disabled = false,
+#endif
.perf_boost_idx = 0,
.perf_constrain_idx = 0,
.prefer_idle = 0,
@@ -239,6 +273,121 @@ struct boost_groups {
/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+#ifdef CONFIG_SCHED_HMP
+static inline void init_sched_boost(struct schedtune *st)
+{
+ st->sched_boost_no_override = false;
+ st->sched_boost_enabled = true;
+ st->sched_boost_enabled_backup = st->sched_boost_enabled;
+ st->colocate = false;
+ st->colocate_update_disabled = false;
+}
+
+bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2)
+{
+ return task_schedtune(tsk1) == task_schedtune(tsk2);
+}
+
+void update_cgroup_boost_settings(void)
+{
+ int i;
+
+ for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
+ if (!allocated_group[i])
+ break;
+
+ if (allocated_group[i]->sched_boost_no_override)
+ continue;
+
+ allocated_group[i]->sched_boost_enabled = false;
+ }
+}
+
+void restore_cgroup_boost_settings(void)
+{
+ int i;
+
+ for (i = 0; i < BOOSTGROUPS_COUNT; i++) {
+ if (!allocated_group[i])
+ break;
+
+ allocated_group[i]->sched_boost_enabled =
+ allocated_group[i]->sched_boost_enabled_backup;
+ }
+}
+
+bool task_sched_boost(struct task_struct *p)
+{
+ struct schedtune *st = task_schedtune(p);
+
+ return st->sched_boost_enabled;
+}
+
+static u64
+sched_boost_override_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->sched_boost_no_override;
+}
+
+static int sched_boost_override_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 override)
+{
+ struct schedtune *st = css_st(css);
+
+ st->sched_boost_no_override = !!override;
+
+ return 0;
+}
+
+static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->sched_boost_enabled;
+}
+
+static int sched_boost_enabled_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 enable)
+{
+ struct schedtune *st = css_st(css);
+
+ st->sched_boost_enabled = !!enable;
+ st->sched_boost_enabled_backup = st->sched_boost_enabled;
+
+ return 0;
+}
+
+static u64 sched_colocate_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->colocate;
+}
+
+static int sched_colocate_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 colocate)
+{
+ struct schedtune *st = css_st(css);
+
+ if (st->colocate_update_disabled)
+ return -EPERM;
+
+ st->colocate = !!colocate;
+ st->colocate_update_disabled = true;
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_sched_boost(struct schedtune *st) { }
+
+#endif /* CONFIG_SCHED_HMP */
+
static void
schedtune_cpu_update(int cpu)
{
@@ -619,6 +768,22 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
return 0;
}
+static void schedtune_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct schedtune *st;
+ bool colocate;
+
+ cgroup_taskset_first(tset, &css);
+ st = css_st(css);
+
+ colocate = st->colocate;
+
+ cgroup_taskset_for_each(task, css, tset)
+ sync_cgroup_colocation(task, colocate);
+}
+
static struct cftype files[] = {
{
.name = "boost",
@@ -630,6 +795,23 @@ static struct cftype files[] = {
.read_u64 = prefer_idle_read,
.write_u64 = prefer_idle_write,
},
+#ifdef CONFIG_SCHED_HMP
+ {
+ .name = "sched_boost_no_override",
+ .read_u64 = sched_boost_override_read,
+ .write_u64 = sched_boost_override_write,
+ },
+ {
+ .name = "sched_boost_enabled",
+ .read_u64 = sched_boost_enabled_read,
+ .write_u64 = sched_boost_enabled_write,
+ },
+ {
+ .name = "colocate",
+ .read_u64 = sched_colocate_read,
+ .write_u64 = sched_colocate_write,
+ },
+#endif
{ } /* terminate */
};
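Usage note (hedged, since names depend on how the schedtune controller is mounted in userspace): with the legacy hierarchy the three HMP-only entries above appear per group as schedtune.sched_boost_no_override, schedtune.sched_boost_enabled and schedtune.colocate, for example under an Android-style /dev/stune mount. Writing a non-zero value enables the corresponding flag; once colocate has been written successfully, further writes return -EPERM because colocate_update_disabled is set.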
@@ -683,6 +865,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
/* Initialize per CPUs boost group support */
st->idx = idx;
+ init_sched_boost(st);
if (schedtune_boostgroup_init(st))
goto release;
@@ -720,6 +903,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
.cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,
.early_init = 1,
+ .attach = schedtune_attach,
};
static inline void
@@ -915,7 +1099,8 @@ schedtune_init(void)
*/
sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
if (!sd) {
- pr_info("schedtune: no energy model data\n");
+ if (energy_aware())
+ pr_warn("schedtune: no energy model data\n");
goto nodata;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index d903c02223af..b2ec21c5c9d6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -32,6 +32,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
+/* CPU mask indicating which CPUs to bring online during smp_init() */
+static bool have_boot_cpu_mask;
+static cpumask_var_t boot_cpu_mask;
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -548,6 +551,19 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
+static int __init boot_cpus(char *str)
+{
+ alloc_bootmem_cpumask_var(&boot_cpu_mask);
+ if (cpulist_parse(str, boot_cpu_mask) < 0) {
+ pr_warn("SMP: Incorrect boot_cpus cpumask\n");
+ return -EINVAL;
+ }
+ have_boot_cpu_mask = true;
+ return 0;
+}
+
+early_param("boot_cpus", boot_cpus);
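Usage note (illustrative values): passing, for example, boot_cpus=0-3 on the kernel command line restricts smp_init() to onlining CPUs 0-3 of the present CPUs; any remaining CPUs stay offline until they are brought up later, e.g. through the CPU hotplug sysfs interface. The string is parsed with cpulist_parse(), so the usual list syntax with commas and ranges applies.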
+
/* Setup number of possible processor ids */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
@@ -563,6 +579,21 @@ void __weak smp_announce(void)
printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
}
+/* Should the given CPU be booted during smp_init() ? */
+static inline bool boot_cpu(int cpu)
+{
+ if (!have_boot_cpu_mask)
+ return true;
+
+ return cpumask_test_cpu(cpu, boot_cpu_mask);
+}
+
+static inline void free_boot_cpu_mask(void)
+{
+ if (have_boot_cpu_mask) /* Allocated from boot_cpus() */
+ free_bootmem_cpumask_var(boot_cpu_mask);
+}
+
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
@@ -574,10 +605,12 @@ void __init smp_init(void)
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
- if (!cpu_online(cpu))
+ if (!cpu_online(cpu) && boot_cpu(cpu))
cpu_up(cpu);
}
+ free_boot_cpu_mask();
+
/* Any cleanup work */
smp_announce();
smp_cpus_done(setup_max_cpus);
@@ -733,8 +766,8 @@ void wake_up_all_idle_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == smp_processor_id())
continue;
-
- wake_up_if_idle(cpu);
+ if (!cpu_isolated(cpu))
+ wake_up_if_idle(cpu);
}
preempt_enable();
}
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d264f59bff56..552e154fc77e 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -13,6 +13,7 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/smpboot.h>
+#include <linux/kmemleak.h>
#include "smpboot.h"
@@ -31,7 +32,7 @@ struct task_struct *idle_thread_get(unsigned int cpu)
if (!tsk)
return ERR_PTR(-ENOMEM);
- init_idle(tsk, cpu);
+ init_idle(tsk, cpu, true);
return tsk;
}
@@ -121,7 +122,45 @@ static int smpboot_thread_fn(void *data)
}
if (kthread_should_park()) {
+ /*
+ * Serialize against wakeup. If we take the lock first,
+			 * wakeup is skipped. If we run later, we observe the
+			 * TASK_RUNNING update from the wakeup path before moving
+			 * forward. This avoids the race where the wakeup path
+			 * observes TASK_INTERRUPTIBLE, the hotplug thread then
+			 * sets TASK_PARKED in kthread_parkme(), and the wakeup
+			 * path finally sets the task state back to TASK_RUNNING.
+			 * In that case the kthread gets parked in TASK_RUNNING
+			 * state, which results in a panic later in
+			 * kthread_unpark(): it sees the KTHREAD_IS_PARKED flag
+			 * set but fails to rebind the kthread because it is not
+			 * in TASK_PARKED state.
+ *
+ * Control thread Hotplug Thread
+ *
+ * kthread_park()
+ * set KTHREAD_SHOULD_PARK
+ * smpboot_thread_fn()
+ * set_current_state(
+ * TASK_INTERRUPTIBLE);
+ * kthread_parkme()
+ *
+ * wake_up_process()
+ *
+ * raw_spin_lock_irqsave(&p->pi_lock, flags);
+ * if (!(p->state & state))
+ * goto out;
+ *
+ * __set_current_state(
+ * TASK_PARKED);
+ *
+ * if (p->on_rq && ttwu_remote(p, wake_flags))
+ * ttwu_remote()
+ * p->state = TASK_RUNNING;
+ * schedule();
+ */
+ raw_spin_lock(&current->pi_lock);
__set_current_state(TASK_RUNNING);
+ raw_spin_unlock(&current->pi_lock);
preempt_enable();
if (ht->park && td->status == HP_THREAD_ACTIVE) {
BUG_ON(td->cpu != smp_processor_id());
@@ -177,6 +216,8 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
if (!td)
return -ENOMEM;
+
+ kmemleak_not_leak(td);
td->cpu = cpu;
td->ht = ht;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 359be4f39986..d69b77fc7cc1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+/*
+ * active_softirqs -- per cpu, a mask of softirqs that are currently being
+ * handled, with the expectation that approximate answers are acceptable and
+ * therefore no synchronization is performed.
+ */
+DEFINE_PER_CPU(__u32, active_softirqs);
+
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -227,6 +234,16 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
+#define softirq_deferred_for_rt(pending) \
+({ \
+ __u32 deferred = 0; \
+ if (cpupri_check_rt()) { \
+ deferred = pending & LONG_SOFTIRQ_MASK; \
+ pending &= ~LONG_SOFTIRQ_MASK; \
+ } \
+ deferred; \
+})
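Sketch only, not from the patch: the macro above simply splits the pending softirq mask into a part serviced now and a part deferred to ksoftirqd while an RT task is waiting. A userspace model with example bit values (the real LONG_SOFTIRQ_MASK lives in interrupt.h and differs):

#include <stdint.h>
#include <stdio.h>

/* Example value only; the real LONG_SOFTIRQ_MASK is defined in interrupt.h. */
#define LONG_SOFTIRQ_MASK 0x30u		/* e.g. the BLOCK and BLOCK_IOPOLL bits */

int main(void)
{
	uint32_t pending = 0x35;	/* example pending bitmask */
	uint32_t deferred = 0;
	int rt_task_waiting = 1;	/* models cpupri_check_rt() returning true */

	if (rt_task_waiting) {
		deferred = pending & LONG_SOFTIRQ_MASK;
		pending &= ~LONG_SOFTIRQ_MASK;
	}
	printf("serviced now: 0x%x, left for ksoftirqd: 0x%x\n",
	       (unsigned)pending, (unsigned)deferred);	/* 0x5 and 0x30 */
	return 0;
}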
+
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
@@ -234,6 +251,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
int max_restart = MAX_SOFTIRQ_RESTART;
struct softirq_action *h;
bool in_hardirq;
+ __u32 deferred;
__u32 pending;
int softirq_bit;
@@ -245,14 +263,15 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
+ deferred = softirq_deferred_for_rt(pending);
account_irq_enter_time(current);
-
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
restart:
/* Reset the pending bitmask before enabling irqs */
- set_softirq_pending(0);
+ set_softirq_pending(deferred);
+ __this_cpu_write(active_softirqs, pending);
local_irq_enable();
@@ -282,18 +301,21 @@ restart:
pending >>= softirq_bit;
}
+ __this_cpu_write(active_softirqs, 0);
rcu_bh_qs();
local_irq_disable();
pending = local_softirq_pending();
+ deferred = softirq_deferred_for_rt(pending);
+
if (pending) {
if (time_before(jiffies, end) && !need_resched() &&
--max_restart)
goto restart;
-
- wakeup_softirqd();
}
+ if (pending | deferred)
+ wakeup_softirqd();
lockdep_softirq_end(in_hardirq);
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bf19e8cbcb51..ae30e357f554 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,12 +125,17 @@ static int __maybe_unused neg_one = -1;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_SCHED_HMP
+static int one_thousand = 1000;
+static int max_freq_reporting_policy = FREQ_REPORT_INVALID_POLICY - 1;
+#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -286,62 +291,259 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_IRQSOFF_TRACER)
{
- .procname = "sched_min_granularity_ns",
- .data = &sysctl_sched_min_granularity,
+ .procname = "preemptoff_tracing_threshold_ns",
+ .data = &sysctl_preemptoff_tracing_threshold_ns,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "irqsoff_tracing_threshold_ns",
+ .data = &sysctl_irqsoff_tracing_threshold_ns,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_SCHED_HMP
+ {
+ .procname = "sched_freq_reporting_policy",
+ .data = &sysctl_sched_freq_reporting_policy,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &max_freq_reporting_policy,
},
{
- .procname = "sched_latency_ns",
- .data = &sysctl_sched_latency,
+ .procname = "sched_freq_inc_notify",
+ .data = &sysctl_sched_freq_inc_notify,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
- .procname = "sched_sync_hint_enable",
- .data = &sysctl_sched_sync_hint_enable,
+ .procname = "sched_freq_dec_notify",
+ .data = &sysctl_sched_freq_dec_notify,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_cpu_high_irqload",
+ .data = &sysctl_sched_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
-#ifdef CONFIG_SCHED_WALT
{
- .procname = "sched_use_walt_cpu_util",
- .data = &sysctl_sched_use_walt_cpu_util,
+ .procname = "sched_ravg_hist_size",
+ .data = &sysctl_sched_ravg_hist_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_window_stats_policy",
+ .data = &sysctl_sched_window_stats_policy,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_spill_load",
+ .data = &sysctl_sched_spill_load_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
- .procname = "sched_use_walt_task_util",
- .data = &sysctl_sched_use_walt_task_util,
+ .procname = "sched_spill_nr_run",
+ .data = &sysctl_sched_spill_nr_run,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_upmigrate",
+ .data = &sysctl_sched_upmigrate_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "sched_downmigrate",
+ .data = &sysctl_sched_downmigrate_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "sched_group_upmigrate",
+ .data = &sysctl_sched_group_upmigrate_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_group_downmigrate",
+ .data = &sysctl_sched_group_downmigrate_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_init_task_load",
+ .data = &sysctl_sched_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "sched_select_prev_cpu_us",
+ .data = &sysctl_sched_select_prev_cpu_us,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_restrict_cluster_spill",
+ .data = &sysctl_sched_restrict_cluster_spill,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "sched_small_wakee_task_load",
+ .data = &sysctl_sched_small_wakee_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "sched_big_waker_task_load",
+ .data = &sysctl_sched_big_waker_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
},
{
- .procname = "sched_walt_init_task_load_pct",
- .data = &sysctl_sched_walt_init_task_load_pct,
+ .procname = "sched_prefer_sync_wakee_to_waker",
+ .data = &sysctl_sched_prefer_sync_wakee_to_waker,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "sched_enable_thread_grouping",
+ .data = &sysctl_sched_enable_thread_grouping,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_pred_alert_freq",
+ .data = &sysctl_sched_pred_alert_freq,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sched_freq_aggregate",
+ .data = &sysctl_sched_freq_aggregate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_window_update_handler,
+ },
+ {
+ .procname = "sched_freq_aggregate_threshold",
+ .data = &sysctl_sched_freq_aggregate_threshold_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_hmp_proc_update_handler,
+ .extra1 = &zero,
+ /*
+ * Special handling for sched_freq_aggregate_threshold_pct
+ * which can be greater than 100. Use 1000 as an upper bound
+ * value which works for all practical use cases.
+ */
+ .extra2 = &one_thousand,
+ },
+ {
+ .procname = "sched_boost",
+ .data = &sysctl_sched_boost,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_boost_handler,
+ .extra1 = &zero,
+ .extra2 = &three,
+ },
+ {
+ .procname = "sched_short_burst_ns",
+ .data = &sysctl_sched_short_burst,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_walt_cpu_high_irqload",
- .data = &sysctl_sched_walt_cpu_high_irqload,
+ .procname = "sched_short_sleep_ns",
+ .data = &sysctl_sched_short_sleep,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_SCHED_HMP */
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ .procname = "sched_min_granularity_ns",
+ .data = &sysctl_sched_min_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_proc_update_handler,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .procname = "sched_latency_ns",
+ .data = &sysctl_sched_latency,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_proc_update_handler,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .procname = "sched_sync_hint_enable",
+ .data = &sysctl_sched_sync_hint_enable,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#endif
{
.procname = "sched_cstate_aware",
.data = &sysctl_sched_cstate_aware,
@@ -1236,6 +1438,27 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+ {
+ .procname = "boot_reason",
+ .data = &boot_reason,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+
+ {
+ .procname = "cold_boot",
+ .data = &cold_boot,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
{ }
};
@@ -1662,6 +1885,22 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
+#ifdef CONFIG_SWAP
+ {
+ .procname = "swap_ratio",
+ .data = &sysctl_swap_ratio,
+ .maxlen = sizeof(sysctl_swap_ratio),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ {
+ .procname = "swap_ratio_enable",
+ .data = &sysctl_swap_ratio_enable,
+ .maxlen = sizeof(sysctl_swap_ratio_enable),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ },
+#endif
{ }
};
@@ -2149,15 +2388,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- *valp = -*lvalp;
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- }
+ *valp = *negp ? -*lvalp : *lvalp;
} else {
int val = *valp;
if (val < 0) {
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 10a1d7dc9313..d7612fcba10a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -138,6 +138,8 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
{ CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
+ { CTL_INT, KERN_BOOT_REASON, "boot_reason" },
+ { CTL_INT, KERN_COLD_BOOT, "cold_boot" },
{}
};
@@ -253,6 +255,7 @@ static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
{ CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
{ CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
{ CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
+ { CTL_INT, NET_IPV4_CONF_NF_IPV4_DEFRAG_SKIP, "nf_ipv4_defrag_skip" },
{}
};
@@ -523,6 +526,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
{ CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
{ CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
{ CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_PREFIX_ROUTE, "accept_ra_prefix_route" },
{}
};
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 21f82c29c914..11cc757795cd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
-static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
+/*
+ * We have to use TASKSTATS_CMD_ATTR_MAX here; it is the maxattr in the
+ * family. Make sure the two stay aligned.
+ */
+static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 49eca0beed32..b9b881ebaff3 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -9,6 +9,7 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
-obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+
+ccflags-y += -Idrivers/cpuidle
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e78480b81f8d..271b37995a89 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,11 @@
#include <linux/workqueue.h>
#include <linux/freezer.h>
+#ifdef CONFIG_MSM_PM
+#include "lpm-levels.h"
+#endif
+#include <linux/workqueue.h>
+
/**
* struct alarm_base - Alarm timer bases
 * @lock: Lock for synchronized access to the base
@@ -46,14 +51,130 @@ static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
static struct wakeup_source *ws;
+static struct delayed_work work;
+static struct workqueue_struct *power_off_alarm_workqueue;
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
static DEFINE_SPINLOCK(rtcdev_lock);
+static struct mutex power_on_alarm_lock;
+static struct alarm init_alarm;
/**
+ * power_on_alarm_init - Init power on alarm value
+ *
+ * Read the RTC alarm value after the device boots up and add this alarm
+ * to the alarm queue.
+ */
+void power_on_alarm_init(void)
+{
+ struct rtc_wkalrm rtc_alarm;
+ struct rtc_time rt;
+ unsigned long alarm_time;
+ struct rtc_device *rtc;
+ ktime_t alarm_ktime;
+
+ rtc = alarmtimer_get_rtcdev();
+
+ if (!rtc)
+ return;
+
+ rtc_read_alarm(rtc, &rtc_alarm);
+ rt = rtc_alarm.time;
+
+ rtc_tm_to_time(&rt, &alarm_time);
+
+ if (alarm_time) {
+ alarm_ktime = ktime_set(alarm_time, 0);
+ alarm_init(&init_alarm, ALARM_POWEROFF_REALTIME, NULL);
+ alarm_start(&init_alarm, alarm_ktime);
+ }
+}
+
+/**
+ * set_power_on_alarm - set power on alarm value into rtc register
+ *
+ * Get the soonest power-off alarm timer and program its value into the
+ * RTC alarm register.
+ */
+void set_power_on_alarm(void)
+{
+ int rc;
+ struct timespec wall_time, alarm_ts;
+ long alarm_secs = 0l;
+ long rtc_secs, alarm_time, alarm_delta;
+ struct rtc_time rtc_time;
+ struct rtc_wkalrm alarm;
+ struct rtc_device *rtc;
+ struct timerqueue_node *next;
+ unsigned long flags;
+ struct alarm_base *base = &alarm_bases[ALARM_POWEROFF_REALTIME];
+
+ rc = mutex_lock_interruptible(&power_on_alarm_lock);
+ if (rc != 0)
+ return;
+
+ spin_lock_irqsave(&base->lock, flags);
+ next = timerqueue_getnext(&base->timerqueue);
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ if (next) {
+ alarm_ts = ktime_to_timespec(next->expires);
+ alarm_secs = alarm_ts.tv_sec;
+ }
+
+ if (!alarm_secs)
+ goto disable_alarm;
+
+ getnstimeofday(&wall_time);
+
+ /*
+ * alarm_secs has to be greater than wall_time + 1 so that the
+ * alarm time is always later than the wall time.
+ */
+ if (alarm_secs <= wall_time.tv_sec + 1)
+ goto disable_alarm;
+
+ rtc = alarmtimer_get_rtcdev();
+ if (!rtc)
+ goto exit;
+
+ rtc_read_time(rtc, &rtc_time);
+ rtc_tm_to_time(&rtc_time, &rtc_secs);
+ alarm_delta = wall_time.tv_sec - rtc_secs;
+ alarm_time = alarm_secs - alarm_delta;
+
+ rtc_time_to_tm(alarm_time, &alarm.time);
+ alarm.enabled = 1;
+ rc = rtc_set_alarm(rtcdev, &alarm);
+ if (rc)
+ goto disable_alarm;
+
+ mutex_unlock(&power_on_alarm_lock);
+ return;
+
+disable_alarm:
+ rtc_alarm_irq_enable(rtcdev, 0);
+exit:
+ mutex_unlock(&power_on_alarm_lock);
+}
+
+static void alarmtimer_triggered_func(void *p)
+{
+ struct rtc_device *rtc = rtcdev;
+
+ if (!(rtc->irq_data & RTC_AF))
+ return;
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+}
+
+static struct rtc_task alarmtimer_rtc_task = {
+ .func = alarmtimer_triggered_func
+};
+/**
* alarmtimer_get_rtcdev - Return selected rtcdevice
*
* This function returns the rtc device to use for wakealarms.
@@ -63,7 +184,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
- struct rtc_device *ret;
+ struct rtc_device *ret = NULL;
spin_lock_irqsave(&rtcdev_lock, flags);
ret = rtcdev;
@@ -77,33 +198,48 @@ static int alarmtimer_rtc_add_device(struct device *dev,
struct class_interface *class_intf)
{
unsigned long flags;
+ int err = 0;
struct rtc_device *rtc = to_rtc_device(dev);
-
if (rtcdev)
return -EBUSY;
-
if (!rtc->ops->set_alarm)
return -1;
- if (!device_may_wakeup(rtc->dev.parent))
- return -1;
spin_lock_irqsave(&rtcdev_lock, flags);
if (!rtcdev) {
+ err = rtc_irq_register(rtc, &alarmtimer_rtc_task);
+ if (err)
+ goto rtc_irq_reg_err;
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
}
+
+rtc_irq_reg_err:
spin_unlock_irqrestore(&rtcdev_lock, flags);
- return 0;
+ return err;
+
+}
+
+static void alarmtimer_rtc_remove_device(struct device *dev,
+ struct class_interface *class_intf)
+{
+ if (rtcdev && dev == &rtcdev->dev) {
+ rtc_irq_unregister(rtcdev, &alarmtimer_rtc_task);
+ rtcdev = NULL;
+ }
}
static inline void alarmtimer_rtc_timer_init(void)
{
+ mutex_init(&power_on_alarm_lock);
+
rtc_timer_init(&rtctimer, NULL, NULL);
}
static struct class_interface alarmtimer_rtc_interface = {
.add_dev = &alarmtimer_rtc_add_device,
+ .remove_dev = &alarmtimer_rtc_remove_device,
};
static int alarmtimer_rtc_interface_setup(void)
@@ -124,8 +260,14 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
static inline void alarmtimer_rtc_timer_init(void) { }
+void set_power_on_alarm(void) { }
#endif
+static void alarm_work_func(struct work_struct *unused)
+{
+ set_power_on_alarm();
+}
+
/**
* alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
* @base: pointer to the base where the timer is being run
@@ -195,6 +337,10 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
+ /* set next power off alarm */
+ if (alarm->type == ALARM_POWEROFF_REALTIME)
+ queue_delayed_work(power_off_alarm_workqueue, &work, 0);
+
return ret;
}
@@ -217,6 +363,68 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
* set an rtc timer to fire that far into the future, which
* will wake us from suspend.
*/
+#if defined(CONFIG_RTC_DRV_QPNP) && defined(CONFIG_MSM_PM)
+static int alarmtimer_suspend(struct device *dev)
+{
+ struct rtc_time tm;
+ ktime_t min, now;
+ unsigned long flags;
+ struct rtc_device *rtc;
+ int i;
+ int ret = 0;
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ min = freezer_delta;
+ freezer_delta = ktime_set(0, 0);
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+
+ rtc = alarmtimer_get_rtcdev();
+ /* If we have no rtcdev, just return */
+ if (!rtc)
+ return 0;
+
+ /* Find the soonest timer to expire */
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ struct alarm_base *base = &alarm_bases[i];
+ struct timerqueue_node *next;
+ ktime_t delta;
+
+ spin_lock_irqsave(&base->lock, flags);
+ next = timerqueue_getnext(&base->timerqueue);
+ spin_unlock_irqrestore(&base->lock, flags);
+ if (!next)
+ continue;
+ delta = ktime_sub(next->expires, base->gettime());
+ if (!min.tv64 || (delta.tv64 < min.tv64))
+ min = delta;
+ }
+ if (min.tv64 == 0)
+ return 0;
+
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
+
+ /* Setup a timer to fire that far in the future */
+ rtc_timer_cancel(rtc, &rtctimer);
+ rtc_read_time(rtc, &tm);
+ now = rtc_tm_to_ktime(tm);
+ now = ktime_add(now, min);
+ if (poweron_alarm) {
+ uint64_t msec = 0;
+
+ msec = ktime_to_ms(min);
+ lpm_suspend_wake_time(msec);
+ } else {
+ /* Set alarm, if in the past reject suspend briefly to handle */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, MSEC_PER_SEC);
+ }
+ return ret;
+}
+#else
static int alarmtimer_suspend(struct device *dev)
{
struct rtc_time tm;
@@ -226,6 +434,8 @@ static int alarmtimer_suspend(struct device *dev)
int i;
int ret;
+ cancel_delayed_work_sync(&work);
+
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
freezer_delta = ktime_set(0, 0);
@@ -271,11 +481,31 @@ static int alarmtimer_suspend(struct device *dev)
__pm_wakeup_event(ws, MSEC_PER_SEC);
return ret;
}
+#endif
+static int alarmtimer_resume(struct device *dev)
+{
+ struct rtc_device *rtc;
+
+ rtc = alarmtimer_get_rtcdev();
+ /* If we have no rtcdev, just return */
+ if (!rtc)
+ return 0;
+ rtc_timer_cancel(rtc, &rtctimer);
+
+ queue_delayed_work(power_off_alarm_workqueue, &work, 0);
+ return 0;
+}
+
#else
static int alarmtimer_suspend(struct device *dev)
{
return 0;
}
+
+static int alarmtimer_resume(struct device *dev)
+{
+ return 0;
+}
#endif
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
@@ -443,12 +673,14 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
* clock2alarm - helper that converts from clockid to alarmtypes
* @clockid: clockid.
*/
-static enum alarmtimer_type clock2alarm(clockid_t clockid)
+enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
if (clockid == CLOCK_BOOTTIME_ALARM)
return ALARM_BOOTTIME;
+ if (clockid == CLOCK_POWEROFF_ALARM)
+ return ALARM_POWEROFF_REALTIME;
return -1;
}
@@ -810,6 +1042,7 @@ out:
/* Suspend hook structures */
static const struct dev_pm_ops alarmtimer_pm_ops = {
.suspend = alarmtimer_suspend,
+ .resume = alarmtimer_resume,
};
static struct platform_driver alarmtimer_driver = {
@@ -844,10 +1077,13 @@ static int __init alarmtimer_init(void)
posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+ posix_timers_register_clock(CLOCK_POWEROFF_ALARM, &alarm_clock);
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_POWEROFF_REALTIME].base_clockid = CLOCK_REALTIME;
+ alarm_bases[ALARM_POWEROFF_REALTIME].gettime = &ktime_get_real;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
for (i = 0; i < ALARM_NUMTYPE; i++) {
@@ -869,8 +1105,24 @@ static int __init alarmtimer_init(void)
goto out_drv;
}
ws = wakeup_source_register("alarmtimer");
- return 0;
+ if (!ws) {
+ error = -ENOMEM;
+ goto out_ws;
+ }
+
+ INIT_DELAYED_WORK(&work, alarm_work_func);
+ power_off_alarm_workqueue =
+ create_singlethread_workqueue("power_off_alarm");
+ if (!power_off_alarm_workqueue) {
+ error = -ENOMEM;
+ goto out_wq;
+ }
+ return 0;
+out_wq:
+ wakeup_source_unregister(ws);
+out_ws:
+ platform_device_unregister(pdev);
out_drv:
platform_driver_unregister(&alarmtimer_driver);
out_if:
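
For reference, a minimal userspace sketch of arming the new power-off alarm clock registered above. It assumes CLOCK_POWEROFF_ALARM is exported through this tree's UAPI headers and that the caller has CAP_WAKE_ALARM; the fallback clockid value is illustrative only, and the program is a sketch rather than the patch's own test code.

#include <signal.h>
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_POWEROFF_ALARM
#define CLOCK_POWEROFF_ALARM 12		/* illustrative value; check the uapi headers */
#endif

int main(void)
{
	timer_t timer;
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct itimerspec its = {
		.it_value = { .tv_sec = 600 },	/* fire ~10 minutes from now */
	};

	if (timer_create(CLOCK_POWEROFF_ALARM, &sev, &timer)) {
		perror("timer_create");
		return 1;
	}
	/*
	 * Arming the timer queues it on ALARM_POWEROFF_REALTIME; the kernel
	 * then programs the soonest expiry into the RTC via set_power_on_alarm().
	 */
	if (timer_settime(timer, 0, &its, NULL)) {
		perror("timer_settime");
		return 1;
	}
	return 0;
}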
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b98810d2f3b4..89cc82a38e4d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -108,7 +108,7 @@ static int finished_booting;
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
-static void clocksource_select(void);
+static void clocksource_select(bool force);
static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
@@ -415,7 +415,7 @@ static int clocksource_watchdog_kthread(void *data)
{
mutex_lock(&clocksource_mutex);
if (__clocksource_watchdog_kthread())
- clocksource_select();
+ clocksource_select(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -555,11 +555,12 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
-static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
+static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur,
+ bool force)
{
struct clocksource *cs;
- if (!finished_booting || list_empty(&clocksource_list))
+ if ((!finished_booting && !force) || list_empty(&clocksource_list))
return NULL;
/*
@@ -577,13 +578,13 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
return NULL;
}
-static void __clocksource_select(bool skipcur)
+static void __clocksource_select(bool skipcur, bool force)
{
bool oneshot = tick_oneshot_mode_active();
struct clocksource *best, *cs;
/* Find the best suitable clocksource */
- best = clocksource_find_best(oneshot, skipcur);
+ best = clocksource_find_best(oneshot, skipcur, force);
if (!best)
return;
@@ -623,22 +624,40 @@ static void __clocksource_select(bool skipcur)
* Select the clocksource with the best rating, or the clocksource,
* which is selected by userspace override.
*/
-static void clocksource_select(void)
+static void clocksource_select(bool force)
{
- __clocksource_select(false);
+ return __clocksource_select(false, force);
}
static void clocksource_select_fallback(void)
{
- __clocksource_select(true);
+ __clocksource_select(true, false);
}
#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-static inline void clocksource_select(void) { }
+
+static inline void clocksource_select(bool force) { }
static inline void clocksource_select_fallback(void) { }
#endif
+/**
+ * clocksource_select_force - Force re-selection of the best clocksource
+ * among registered clocksources
+ *
+ * clocksource_select() can't select the best clocksource before
+ * clocksource_done_booting() has run, and it must be called with
+ * clocksource_mutex held, so provide a new API that other files can
+ * call to select the best clocksource irrespective of the
+ * finished_booting flag.
+ */
+void clocksource_select_force(void)
+{
+ mutex_lock(&clocksource_mutex);
+ clocksource_select(true);
+ mutex_unlock(&clocksource_mutex);
+}
+
/*
* clocksource_done_booting - Called near the end of core bootup
*
@@ -655,7 +674,7 @@ static int __init clocksource_done_booting(void)
* Run the watchdog first to eliminate unstable clock sources
*/
__clocksource_watchdog_kthread();
- clocksource_select();
+ clocksource_select(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -744,6 +763,7 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
}
EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
+
/**
* __clocksource_register_scale - Used to install new clocksources
* @cs: clocksource to be registered
@@ -765,7 +785,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
- clocksource_select();
+ clocksource_select(false);
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
return 0;
@@ -788,7 +808,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
{
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
- clocksource_select();
+ clocksource_select(false);
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
}
@@ -892,7 +912,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
ret = sysfs_get_uname(buf, override_name, count);
if (ret >= 0)
- clocksource_select();
+ clocksource_select(false);
mutex_unlock(&clocksource_mutex);
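
A hedged sketch of the intended caller pattern for clocksource_select_force(): a clocksource driver in this tree that registers before clocksource_done_booting() and wants selection re-run immediately. The driver name, rating and frequency are placeholders, not taken from the patch.

#include <linux/clocksource.h>
#include <linux/init.h>

extern void clocksource_select_force(void);	/* added by this patch */

static cycle_t example_cs_read(struct clocksource *cs)
{
	return 0;	/* read the hardware counter here */
}

static struct clocksource example_cs = {
	.name	= "example-timer",
	.rating	= 400,
	.read	= example_cs_read,
	.mask	= CLOCKSOURCE_MASK(56),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_cs_init(void)
{
	int ret = clocksource_register_hz(&example_cs, 19200000);

	if (!ret)
		/* Force re-selection even though finished_booting is not set. */
		clocksource_select_force();
	return ret;
}
device_initcall(example_cs_init);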
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8d2e7074860b..79fadcad21ff 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -784,34 +784,6 @@ void hrtimers_resume(void)
clock_was_set_delayed();
}
-static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- if (timer->start_site)
- return;
- timer->start_site = __builtin_return_address(0);
- memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
- timer->start_pid = current->pid;
-#endif
-}
-
-static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
-#endif
-}
-
-static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- if (likely(!timer_stats_active))
- return;
- timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
- timer->function, timer->start_comm, 0);
-#endif
-}
-
/*
* Counterpart to lock_hrtimer_base above:
*/
@@ -888,7 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
base->cpu_base->active_bases |= 1 << base->index;
- timer->state = HRTIMER_STATE_ENQUEUED;
+ timer->state |= HRTIMER_STATE_ENQUEUED;
return timerqueue_add(&base->active, &timer->node);
}
@@ -908,11 +880,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
u8 newstate, int reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- u8 state = timer->state;
- timer->state = newstate;
- if (!(state & HRTIMER_STATE_ENQUEUED))
- return;
+ if (!(timer->state & HRTIMER_STATE_ENQUEUED))
+ goto out;
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
@@ -929,6 +899,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
#endif
+
+out:
+ /*
+ * We need to preserve PINNED state here, otherwise we may end up
+ * migrating pinned hrtimers as well.
+ */
+ timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED);
}
/*
@@ -950,13 +927,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
* rare case and less expensive than a smp call.
*/
debug_deactivate(timer);
- timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
if (!restart)
state = HRTIMER_STATE_INACTIVE;
__remove_hrtimer(timer, base, state, reprogram);
+ timer->state &= ~HRTIMER_STATE_PINNED;
return 1;
}
return 0;
@@ -1008,7 +985,9 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- timer_stats_hrtimer_set_start_info(timer);
+ /* Update pinned state */
+ timer->state &= ~HRTIMER_STATE_PINNED;
+ timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT;
leftmost = enqueue_hrtimer(timer, new_base);
if (!leftmost)
@@ -1151,12 +1130,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base = hrtimer_clockid_to_base(clock_id);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
-
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
- timer->start_pid = -1;
- memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
}
/**
@@ -1189,8 +1162,8 @@ bool hrtimer_active(const struct hrtimer *timer)
cpu_base = READ_ONCE(timer->base->cpu_base);
seq = raw_read_seqcount_begin(&cpu_base->seq);
- if (timer->state != HRTIMER_STATE_INACTIVE ||
- cpu_base->running == timer)
+ if (((timer->state & ~HRTIMER_STATE_PINNED) !=
+ HRTIMER_STATE_INACTIVE) || cpu_base->running == timer)
return true;
} while (read_seqcount_retry(&cpu_base->seq, seq) ||
@@ -1240,7 +1213,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
raw_write_seqcount_barrier(&cpu_base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
- timer_stats_account_hrtimer(timer);
fn = timer->function;
/*
@@ -1628,17 +1600,23 @@ static void init_hrtimers_cpu(int cpu)
hrtimer_init_hres(cpu_base);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
+#if defined(CONFIG_HOTPLUG_CPU)
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
- struct hrtimer_clock_base *new_base)
+ struct hrtimer_clock_base *new_base,
+ bool remove_pinned)
{
struct hrtimer *timer;
struct timerqueue_node *node;
+ struct timerqueue_head pinned;
+ int is_pinned;
+ bool is_hotplug = !cpu_online(old_base->cpu_base->cpu);
+
+ timerqueue_init_head(&pinned);
while ((node = timerqueue_getnext(&old_base->active))) {
timer = container_of(node, struct hrtimer, node);
- BUG_ON(hrtimer_callback_running(timer));
+ if (is_hotplug)
+ BUG_ON(hrtimer_callback_running(timer));
debug_deactivate(timer);
/*
@@ -1647,6 +1625,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* under us on another CPU
*/
__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+
+ is_pinned = timer->state & HRTIMER_STATE_PINNED;
+ if (!remove_pinned && is_pinned) {
+ timerqueue_add(&pinned, &timer->node);
+ continue;
+ }
+
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1658,17 +1643,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
*/
enqueue_hrtimer(timer, new_base);
}
+
+ /* Re-queue pinned timers for the non-hotplug case */
+ while ((node = timerqueue_getnext(&pinned))) {
+ timer = container_of(node, struct hrtimer, node);
+
+ timerqueue_del(&pinned, &timer->node);
+ enqueue_hrtimer(timer, old_base);
+ }
}
-static void migrate_hrtimers(int scpu)
+static void __migrate_hrtimers(int scpu, bool remove_pinned)
{
struct hrtimer_cpu_base *old_base, *new_base;
+ unsigned long flags;
int i;
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
-
- local_irq_disable();
+ local_irq_save(flags);
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
/*
@@ -1680,7 +1671,7 @@ static void migrate_hrtimers(int scpu)
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
+ &new_base->clock_base[i], remove_pinned);
}
raw_spin_unlock(&old_base->lock);
@@ -1688,7 +1679,20 @@ static void migrate_hrtimers(int scpu)
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
- local_irq_enable();
+ local_irq_restore(flags);
+}
+
+static void migrate_hrtimers(int scpu)
+{
+ BUG_ON(cpu_online(scpu));
+ tick_cancel_sched_timer(scpu);
+
+ __migrate_hrtimers(scpu, true);
+}
+
+void hrtimer_quiesce_cpu(void *cpup)
+{
+ __migrate_hrtimers(*(int *)cpup, false);
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -1796,15 +1800,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns.
+ * pass before the routine returns unless the current task is explicitly
+ * woken up (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task.
+ * delivered to the current task or the current task is explicitly woken
+ * up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
- * Returns 0 when the timer has expired otherwise -EINTR
+ * Returns 0 when the timer has expired. If the task was woken before the
+ * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
+ * by an explicit wakeup, it returns -EINTR.
*/
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
@@ -1826,15 +1834,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns.
+ * pass before the routine returns unless the current task is explicitly
+ * woken up (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task.
+ * delivered to the current task or the current task is explicitly woken
+ * up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
- * Returns 0 when the timer has expired otherwise -EINTR
+ * Returns 0 when the timer has expired. If the task was woken before the
+ * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
+ * by an explicit wakeup, it returns -EINTR.
*/
int __sched schedule_hrtimeout(ktime_t *expires,
const enum hrtimer_mode mode)
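
To illustrate the state handling above, a minimal sketch of a pinned hrtimer: with this change the PINNED bit survives in timer->state, so hrtimer_quiesce_cpu() re-queues such timers on their original base instead of migrating them. The callback and period are illustrative.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
	/* per-CPU housekeeping goes here */
	hrtimer_forward_now(t, ms_to_ktime(100));
	return HRTIMER_RESTART;
}

static void example_timer_start(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	example_timer.function = example_timer_fn;
	/* PINNED mode now also sets HRTIMER_STATE_PINNED in timer->state. */
	hrtimer_start(&example_timer, ms_to_ktime(100), HRTIMER_MODE_REL_PINNED);
}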
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 8fc68e60c795..d176c127f744 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1250,7 +1250,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
cputime_t *newval, cputime_t *oldval)
{
- unsigned long long now;
+ unsigned long long now = 0;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
cpu_timer_sample_group(clock_idx, tsk, &now);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 382b159d8592..1986ab6e5943 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -70,6 +70,11 @@ struct clock_data {
static struct hrtimer sched_clock_timer;
static int irqtime = -1;
+static int initialized;
+static u64 suspend_ns;
+static u64 suspend_cycles;
+static u64 resume_cycles;
+
core_param(irqtime, irqtime, int, 0400);
@@ -236,6 +241,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pF as sched_clock source\n", read);
}
+int sched_clock_initialized(void)
+{
+ return initialized;
+}
+
void __init sched_clock_postinit(void)
{
/*
@@ -254,6 +264,8 @@ void __init sched_clock_postinit(void)
hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sched_clock_timer.function = sched_clock_poll;
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+
+ initialized = 1;
}
/*
@@ -279,6 +291,11 @@ static int sched_clock_suspend(void)
struct clock_read_data *rd = &cd.read_data[0];
update_sched_clock();
+
+ suspend_ns = rd->epoch_ns;
+ suspend_cycles = rd->epoch_cyc;
+ pr_info("suspend ns:%17llu suspend cycles:%17llu\n",
+ rd->epoch_ns, rd->epoch_cyc);
hrtimer_cancel(&sched_clock_timer);
rd->read_sched_clock = suspended_sched_clock_read;
@@ -290,6 +307,8 @@ static void sched_clock_resume(void)
struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock();
+ resume_cycles = rd->epoch_cyc;
+ pr_info("resume cycles:%17llu\n", rd->epoch_cyc);
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
rd->read_sched_clock = cd.actual_read_sched_clock;
}
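
The suspend/resume cycle counts recorded above are only printed, but they also allow deriving how far the underlying counter advanced across suspend. A hypothetical helper (not part of the patch) inside sched_clock.c could compute it with the existing cyc_to_ns() and read data:

/* Hypothetical addition to kernel/time/sched_clock.c, shown for illustration. */
static u64 sched_clock_suspend_delta_ns(void)
{
	struct clock_read_data *rd = &cd.read_data[0];
	u64 cyc = (resume_cycles - suspend_cycles) & rd->sched_clock_mask;

	/* Convert elapsed counter cycles to nanoseconds. */
	return cyc_to_ns(cyc, rd->mult, rd->shift);
}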
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4cd1145ec260..d675f8b06110 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,11 +19,13 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/timer.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
#include <linux/context_tracking.h>
+#include <linux/rq_stats.h>
#include <asm/irq_regs.h>
@@ -31,6 +33,10 @@
#include <trace/events/timer.h>
+struct rq_data rq_info;
+struct workqueue_struct *rq_wq;
+spinlock_t rq_lock;
+
/*
* Per cpu nohz control structure
*/
@@ -41,6 +47,21 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
*/
static ktime_t last_jiffies_update;
+u64 jiffy_to_ktime_ns(u64 *now, u64 *jiffy_ktime_ns)
+{
+ u64 cur_jiffies;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&jiffies_lock);
+ *now = ktime_get_ns();
+ *jiffy_ktime_ns = ktime_to_ns(last_jiffies_update);
+ cur_jiffies = get_jiffies_64();
+ } while (read_seqretry(&jiffies_lock, seq));
+
+ return cur_jiffies;
+}
+
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
@@ -143,7 +164,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
}
@@ -430,7 +451,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now);
local_irq_restore(flags);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
/*
@@ -716,7 +737,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
update_cpu_load_nohz();
calc_load_exit_idle();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
@@ -804,6 +825,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
now = tick_nohz_start_idle(ts);
+#ifdef CONFIG_SMP
+ if (check_pending_deferrable_timers(cpu))
+ raise_softirq_irqoff(TIMER_SOFTIRQ);
+#endif
+
if (can_stop_idle_tick(cpu, ts)) {
int was_stopped = ts->tick_stopped;
@@ -1076,6 +1102,51 @@ void tick_irq_enter(void)
* High resolution timer specific code
*/
#ifdef CONFIG_HIGH_RES_TIMERS
+static void update_rq_stats(void)
+{
+ unsigned long jiffy_gap = 0;
+ unsigned int rq_avg = 0;
+ unsigned long flags = 0;
+
+ jiffy_gap = jiffies - rq_info.rq_poll_last_jiffy;
+
+ if (jiffy_gap >= rq_info.rq_poll_jiffies) {
+
+ spin_lock_irqsave(&rq_lock, flags);
+
+ if (!rq_info.rq_avg)
+ rq_info.rq_poll_total_jiffies = 0;
+
+ rq_avg = nr_running() * 10;
+
+ if (rq_info.rq_poll_total_jiffies) {
+ rq_avg = (rq_avg * jiffy_gap) +
+ (rq_info.rq_avg *
+ rq_info.rq_poll_total_jiffies);
+ do_div(rq_avg,
+ rq_info.rq_poll_total_jiffies + jiffy_gap);
+ }
+
+ rq_info.rq_avg = rq_avg;
+ rq_info.rq_poll_total_jiffies += jiffy_gap;
+ rq_info.rq_poll_last_jiffy = jiffies;
+
+ spin_unlock_irqrestore(&rq_lock, flags);
+ }
+}
+
+static void wakeup_user(void)
+{
+ unsigned long jiffy_gap;
+
+ jiffy_gap = jiffies - rq_info.def_timer_last_jiffy;
+
+ if (jiffy_gap >= rq_info.def_timer_jiffies) {
+ rq_info.def_timer_last_jiffy = jiffies;
+ queue_work(rq_wq, &rq_info.def_timer_work);
+ }
+}
+
/*
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled.
@@ -1093,9 +1164,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
* Do not call, when we are not in irq context and have
* no valid regs pointer
*/
- if (regs)
+ if (regs) {
tick_sched_handle(ts, regs);
+ if (rq_info.init == 1 &&
+ tick_do_timer_cpu == smp_processor_id()) {
+ /*
+ * update run queue statistics
+ */
+ update_rq_stats();
+
+ /*
+ * wakeup user if needed
+ */
+ wakeup_user();
+ }
+ }
+
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
@@ -1208,3 +1293,8 @@ int tick_check_oneshot_change(int allow_nohz)
tick_nohz_switch_to_nohz();
return 0;
}
+
+ktime_t * get_next_event_cpu(unsigned int cpu)
+{
+ return &(per_cpu(tick_cpu_device, cpu).evtdev->next_event);
+}
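
A hedged sketch of how a consumer might read the average maintained by update_rq_stats(); rq_info and rq_lock come from <linux/rq_stats.h>, which this hunk includes, and the value is scaled by 10 (25 means roughly 2.5 runnable tasks). Zeroing rq_avg after the read restarts the window, since update_rq_stats() clears the accumulated jiffies when rq_avg is 0.

#include <linux/rq_stats.h>
#include <linux/spinlock.h>

static unsigned int read_rq_avg_tenths(void)
{
	unsigned long flags;
	unsigned int avg;

	spin_lock_irqsave(&rq_lock, flags);
	avg = rq_info.rq_avg;
	/* Zeroing rq_avg makes update_rq_stats() restart its averaging window. */
	rq_info.rq_avg = 0;
	spin_unlock_irqrestore(&rq_lock, flags);

	return avg;
}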
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7902ecbce8ec..0651c7f47932 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1669,9 +1669,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
{
s64 interval = tk->cycle_interval;
s64 xinterval = tk->xtime_interval;
+ u32 base = tk->tkr_mono.clock->mult;
+ u32 max = tk->tkr_mono.clock->maxadj;
+ u32 cur_adj = tk->tkr_mono.mult;
s64 tick_error;
bool negative;
- u32 adj;
+ u32 adj_scale;
/* Remove any current error adj from freq calculation */
if (tk->ntp_err_mult)
@@ -1690,13 +1693,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
/* preserve the direction of correction */
negative = (tick_error < 0);
- /* Sort out the magnitude of the correction */
+ /* If any adjustment would pass the max, just return */
+ if (negative && (cur_adj - 1) <= (base - max))
+ return;
+ if (!negative && (cur_adj + 1) >= (base + max))
+ return;
+ /*
+ * Sort out the magnitude of the correction, but
+ * avoid making so large a correction that we go
+ * over the max adjustment.
+ */
+ adj_scale = 0;
tick_error = abs(tick_error);
- for (adj = 0; tick_error > interval; adj++)
+ while (tick_error > interval) {
+ u32 adj = 1 << (adj_scale + 1);
+
+ /* Check if adjustment gets us within 1 unit from the max */
+ if (negative && (cur_adj - adj) <= (base - max))
+ break;
+ if (!negative && (cur_adj + adj) >= (base + max))
+ break;
+
+ adj_scale++;
tick_error >>= 1;
+ }
/* scale the corrections */
- timekeeping_apply_adjustment(tk, offset, negative, adj);
+ timekeeping_apply_adjustment(tk, offset, negative, adj_scale);
}
/*
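
A standalone illustration (userspace, made-up numbers) of the clamped scaling loop introduced above: instead of scaling purely by how many times tick_error halves into the interval, the loop stops as soon as the next power-of-two step would push the multiplier past base + maxadj. Only the positive direction is shown.

#include <stdio.h>

int main(void)
{
	/* made-up values: clock mult, maxadj, current adjusted mult */
	unsigned int base = 100000, max = 500, cur_adj = 100300;
	long long interval = 1000, tick_error = 16000;	/* 16x the interval */
	unsigned int adj_scale = 0;

	while (tick_error > interval) {
		unsigned int adj = 1u << (adj_scale + 1);

		if (cur_adj + adj >= base + max)
			break;		/* next step would exceed maxadj */
		adj_scale++;
		tick_error >>= 1;
	}
	/* prints adj_scale=4, i.e. an applied step of 1 << 4 = 16 */
	printf("adj_scale=%u\n", adj_scale);
	return 0;
}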
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 523fe1669d4c..903705687b52 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -94,12 +94,16 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
+static inline void __run_timers(struct tvec_base *base);
static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
unsigned int sysctl_timer_migration = 1;
+struct tvec_base tvec_base_deferrable;
+static atomic_t deferrable_pending;
+
void timers_update_migration(bool update_nohz)
{
bool on = sysctl_timer_migration && tick_nohz_active;
@@ -135,18 +139,66 @@ int timer_migration_handler(struct ctl_table *table, int write,
}
static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
+ int pinned, u32 timer_flags)
{
+ if (!pinned && !(timer_flags & TIMER_PINNED_ON_CPU) &&
+ (timer_flags & TIMER_DEFERRABLE))
+ return &tvec_base_deferrable;
if (pinned || !base->migration_enabled)
return this_cpu_ptr(&tvec_bases);
return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
}
+
+static inline void __run_deferrable_timers(void)
+{
+ if ((atomic_cmpxchg(&deferrable_pending, 1, 0) &&
+ tick_do_timer_cpu == TICK_DO_TIMER_NONE) ||
+ tick_do_timer_cpu == smp_processor_id()) {
+ if (time_after_eq(jiffies,
+ tvec_base_deferrable.timer_jiffies))
+ __run_timers(&tvec_base_deferrable);
+ }
+}
+
+static inline void init_timer_deferrable_global(void)
+{
+ tvec_base_deferrable.cpu = nr_cpu_ids;
+ spin_lock_init(&tvec_base_deferrable.lock);
+ tvec_base_deferrable.timer_jiffies = jiffies;
+ tvec_base_deferrable.next_timer = tvec_base_deferrable.timer_jiffies;
+}
+
+static inline struct tvec_base *get_timer_base(u32 timer_flags)
+{
+ if (!(timer_flags & TIMER_PINNED_ON_CPU) &&
+ timer_flags & TIMER_DEFERRABLE)
+ return &tvec_base_deferrable;
+ else
+ return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK);
+}
#else
static inline struct tvec_base *get_target_base(struct tvec_base *base,
- int pinned)
+ int pinned, u32 timer_flags)
{
return this_cpu_ptr(&tvec_bases);
}
+
+static inline void __run_deferrable_timers(void)
+{
+}
+
+static inline void init_timer_deferrable_global(void)
+{
+ /*
+ * The CPU-unbound deferrable timer base is only initialized when
+ * CONFIG_SMP is set; a UP kernel handles these timers on the CPU 0
+ * timer base.
+ */
+}
+
+static inline struct tvec_base *get_timer_base(u32 timer_flags)
+{
+ return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK);
+}
#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -448,38 +500,6 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
}
}
-#ifdef CONFIG_TIMER_STATS
-void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
-{
- if (timer->start_site)
- return;
-
- timer->start_site = addr;
- memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
- timer->start_pid = current->pid;
-}
-
-static void timer_stats_account_timer(struct timer_list *timer)
-{
- void *site;
-
- /*
- * start_site can be concurrently reset by
- * timer_stats_timer_clear_start_info()
- */
- site = READ_ONCE(timer->start_site);
- if (likely(!site))
- return;
-
- timer_stats_update_stats(timer, timer->start_pid, site,
- timer->function, timer->start_comm,
- timer->flags);
-}
-
-#else
-static void timer_stats_account_timer(struct timer_list *timer) {}
-#endif
-
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static struct debug_obj_descr timer_debug_descr;
@@ -682,11 +702,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
timer->entry.pprev = NULL;
timer->flags = flags | raw_smp_processor_id();
timer->slack = -1;
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
- timer->start_pid = -1;
- memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -775,7 +790,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
tf = READ_ONCE(timer->flags);
if (!(tf & TIMER_MIGRATING)) {
- base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+ base = get_timer_base(tf);
spin_lock_irqsave(&base->lock, *flags);
if (timer->flags == tf)
return base;
@@ -793,7 +808,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
unsigned long flags;
int ret = 0;
- timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
base = lock_timer_base(timer, &flags);
@@ -804,7 +818,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- new_base = get_target_base(base, pinned);
+ new_base = get_target_base(base, pinned, timer->flags);
if (base != new_base) {
/*
@@ -826,6 +840,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
}
+ if (pinned == TIMER_PINNED)
+ timer->flags |= TIMER_PINNED_ON_CPU;
+ else
+ timer->flags &= ~TIMER_PINNED_ON_CPU;
timer->expires = expires;
internal_add_timer(base, timer);
@@ -988,7 +1006,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
struct tvec_base *base;
unsigned long flags;
- timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
/*
@@ -1007,6 +1024,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ timer->flags |= TIMER_PINNED_ON_CPU;
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
@@ -1032,7 +1050,6 @@ int del_timer(struct timer_list *timer)
debug_assert_init(timer);
- timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
@@ -1060,10 +1077,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer) {
- timer_stats_timer_clear_start_info(timer);
+ if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
- }
+
spin_unlock_irqrestore(&base->lock, flags);
return ret;
@@ -1247,8 +1263,6 @@ static inline void __run_timers(struct tvec_base *base)
data = timer->data;
irqsafe = timer->flags & TIMER_IRQSAFE;
- timer_stats_account_timer(timer);
-
base->running_timer = timer;
detach_expired_timer(timer, base);
@@ -1376,6 +1390,30 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
+#ifdef CONFIG_SMP
+/*
+ * check_pending_deferrable_timers - Check for unbound deferrable timer expiry.
+ * @cpu - Current CPU
+ *
+ * The function checks whether any global (CPU-unbound) deferrable pending
+ * timers have expired. It does not check CPU-bound deferrable pending
+ * timers.
+ *
+ * The function returns true when a CPU-unbound deferrable timer has expired.
+ */
+bool check_pending_deferrable_timers(int cpu)
+{
+ if (cpu == tick_do_timer_cpu ||
+ tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
+ if (time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies)
+ && !atomic_cmpxchg(&deferrable_pending, 0, 1)) {
+ return true;
+ }
+ }
+ return false;
+}
+#endif
+
/**
* get_next_timer_interrupt - return the time (clock mono) of the next timer
* @basej: base time jiffies
@@ -1440,6 +1478,8 @@ static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ __run_deferrable_timers();
+
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
}
@@ -1482,11 +1522,12 @@ static void process_timeout(unsigned long __data)
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
- * pass before the routine returns. The routine will return 0
+ * pass before the routine returns unless the current task is explicitly
+ * woken up (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * in jiffies will be returned, or 0 if the timer expired in time
+ * delivered to the current task or the current task is explicitly woken
+ * up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
@@ -1495,7 +1536,9 @@ static void process_timeout(unsigned long __data)
* the CPU away without a bound on the timeout. In this case the return
* value will be %MAX_SCHEDULE_TIMEOUT.
*
- * In all cases the return value is guaranteed to be non-negative.
+ * Returns 0 when the timer has expired, otherwise the remaining time in
+ * jiffies will be returned. In all cases the return value is guaranteed
+ * to be non-negative.
*/
signed long __sched schedule_timeout(signed long timeout)
{
@@ -1573,56 +1616,82 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+#if defined(CONFIG_HOTPLUG_CPU)
+static void migrate_timer_list(struct tvec_base *new_base,
+ struct hlist_head *head, bool remove_pinned)
{
struct timer_list *timer;
int cpu = new_base->cpu;
+ struct hlist_node *n;
+ int is_pinned;
+
+ hlist_for_each_entry_safe(timer, n, head, entry) {
+ is_pinned = timer->flags & TIMER_PINNED_ON_CPU;
+ if (!remove_pinned && is_pinned)
+ continue;
- while (!hlist_empty(head)) {
- timer = hlist_entry(head->first, struct timer_list, entry);
- /* We ignore the accounting on the dying cpu */
- detach_timer(timer, false);
+ detach_if_pending(timer, get_timer_base(timer->flags), false);
timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
-static void migrate_timers(int cpu)
+static void __migrate_timers(int cpu, bool remove_pinned)
{
struct tvec_base *old_base;
struct tvec_base *new_base;
+ unsigned long flags;
int i;
- BUG_ON(cpu_online(cpu));
old_base = per_cpu_ptr(&tvec_bases, cpu);
new_base = get_cpu_ptr(&tvec_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- spin_lock_irq(&new_base->lock);
+ spin_lock_irqsave(&new_base->lock, flags);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
- BUG_ON(old_base->running_timer);
+ /*
+ * If we're in the hotplug path, kill the system if there's a running
+ * timer. It's ok to have a running timer in the isolation case - the
+ * currently running or just-expired timers are off the timer wheel,
+ * so everything else can be migrated off.
+ */
+ if (!cpu_online(cpu))
+ BUG_ON(old_base->running_timer);
for (i = 0; i < TVR_SIZE; i++)
- migrate_timer_list(new_base, old_base->tv1.vec + i);
+ migrate_timer_list(new_base, old_base->tv1.vec + i,
+ remove_pinned);
for (i = 0; i < TVN_SIZE; i++) {
- migrate_timer_list(new_base, old_base->tv2.vec + i);
- migrate_timer_list(new_base, old_base->tv3.vec + i);
- migrate_timer_list(new_base, old_base->tv4.vec + i);
- migrate_timer_list(new_base, old_base->tv5.vec + i);
+ migrate_timer_list(new_base, old_base->tv2.vec + i,
+ remove_pinned);
+ migrate_timer_list(new_base, old_base->tv3.vec + i,
+ remove_pinned);
+ migrate_timer_list(new_base, old_base->tv4.vec + i,
+ remove_pinned);
+ migrate_timer_list(new_base, old_base->tv5.vec + i,
+ remove_pinned);
}
- old_base->active_timers = 0;
- old_base->all_timers = 0;
-
spin_unlock(&old_base->lock);
- spin_unlock_irq(&new_base->lock);
+ spin_unlock_irqrestore(&new_base->lock, flags);
put_cpu_ptr(&tvec_bases);
}
+/* Migrate timers from 'cpu' to this_cpu */
+static void migrate_timers(int cpu)
+{
+ BUG_ON(cpu_online(cpu));
+ __migrate_timers(cpu, true);
+}
+
+void timer_quiesce_cpu(void *cpup)
+{
+ __migrate_timers(*(int *)cpup, false);
+}
+
static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -1663,12 +1732,13 @@ static void __init init_timer_cpus(void)
for_each_possible_cpu(cpu)
init_timer_cpu(cpu);
+
+ init_timer_deferrable_global();
}
void __init init_timers(void)
{
init_timer_cpus();
- init_timer_stats();
timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -1702,16 +1772,6 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
-static void __sched do_usleep_range(unsigned long min, unsigned long max)
-{
- ktime_t kmin;
- u64 delta;
-
- kmin = ktime_set(0, min * NSEC_PER_USEC);
- delta = (u64)(max - min) * NSEC_PER_USEC;
- schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
-}
-
/**
* usleep_range - Drop in replacement for udelay where wakeup is flexible
* @min: Minimum time in usecs to sleep
@@ -1719,7 +1779,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
*/
void __sched usleep_range(unsigned long min, unsigned long max)
{
- __set_current_state(TASK_UNINTERRUPTIBLE);
- do_usleep_range(min, max);
+ ktime_t exp = ktime_add_us(ktime_get(), min);
+ u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+
+ for (;;) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ /* Do not return before the requested sleep time has elapsed */
+ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
+ break;
+ }
}
EXPORT_SYMBOL(usleep_range);
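
A usage sketch for the reworked usleep_range() above: the expiry is now an absolute ktime computed once, and the loop re-arms after any early wakeup, so callers such as the polling loop below get at least the minimum delay. The register and bit are placeholders.

#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/io.h>

static void example_wait_ready(void __iomem *status_reg)
{
	/*
	 * Poll every 100-200us; wakeups may be coalesced anywhere in that
	 * window, but usleep_range() never returns before 100us now.
	 */
	while (!(readl(status_reg) & BIT(0)))
		usleep_range(100, 200);
}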
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1407ed20ea93..d38f8dde3c75 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -63,21 +63,11 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
-#ifdef CONFIG_TIMER_STATS
- char tmp[TASK_COMM_LEN + 1];
-#endif
SEQ_printf(m, " #%d: ", idx);
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
-#ifdef CONFIG_TIMER_STATS
- SEQ_printf(m, ", ");
- print_name_offset(m, timer->start_site);
- memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
- tmp[TASK_COMM_LEN] = 0;
- SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
-#endif
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
deleted file mode 100644
index 1adecb4b87c8..000000000000
--- a/kernel/time/timer_stats.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * kernel/time/timer_stats.c
- *
- * Collect timer usage statistics.
- *
- * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
- * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * timer_stats is based on timer_top, a similar functionality which was part of
- * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
- * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
- * on dynamic allocation of the statistics entries and linear search based
- * lookup combined with a global lock, rather than the static array, hash
- * and per-CPU locking which is used by timer_stats. It was written for the
- * pre hrtimer kernel code and therefore did not take hrtimers into account.
- * Nevertheless it provided the base for the timer_stats implementation and
- * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
- * for this effort.
- *
- * timer_top.c is
- * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
- * Written by Daniel Petrini <d.pensator@gmail.com>
- * timer_top.c was released under the GNU General Public License version 2
- *
- * We export the addresses and counting of timer functions being called,
- * the pid and cmdline from the owner process if applicable.
- *
- * Start/stop data collection:
- * # echo [1|0] >/proc/timer_stats
- *
- * Display the information collected so far:
- * # cat /proc/timer_stats
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-
-#include <asm/uaccess.h>
-
-/*
- * This is our basic unit of interest: a timer expiry event identified
- * by the timer, its start/expire functions and the PID of the task that
- * started the timer. We count the number of times an event happens:
- */
-struct entry {
- /*
- * Hash list:
- */
- struct entry *next;
-
- /*
- * Hash keys:
- */
- void *timer;
- void *start_func;
- void *expire_func;
- pid_t pid;
-
- /*
- * Number of timeout events:
- */
- unsigned long count;
- u32 flags;
-
- /*
- * We save the command-line string to preserve
- * this information past task exit:
- */
- char comm[TASK_COMM_LEN + 1];
-
-} ____cacheline_aligned_in_smp;
-
-/*
- * Spinlock protecting the tables - not taken during lookup:
- */
-static DEFINE_RAW_SPINLOCK(table_lock);
-
-/*
- * Per-CPU lookup locks for fast hash lookup:
- */
-static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
-
-/*
- * Mutex to serialize state changes with show-stats activities:
- */
-static DEFINE_MUTEX(show_mutex);
-
-/*
- * Collection status, active/inactive:
- */
-int __read_mostly timer_stats_active;
-
-/*
- * Beginning/end timestamps of measurement:
- */
-static ktime_t time_start, time_stop;
-
-/*
- * tstat entry structs only get allocated while collection is
- * active and never freed during that time - this simplifies
- * things quite a bit.
- *
- * They get freed when a new collection period is started.
- */
-#define MAX_ENTRIES_BITS 10
-#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
-
-static unsigned long nr_entries;
-static struct entry entries[MAX_ENTRIES];
-
-static atomic_t overflow_count;
-
-/*
- * The entries are in a hash-table, for fast lookup:
- */
-#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
-#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
-#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
-
-#define __tstat_hashfn(entry) \
- (((unsigned long)(entry)->timer ^ \
- (unsigned long)(entry)->start_func ^ \
- (unsigned long)(entry)->expire_func ^ \
- (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
-
-#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
-
-static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
-
-static void reset_entries(void)
-{
- nr_entries = 0;
- memset(entries, 0, sizeof(entries));
- memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
- atomic_set(&overflow_count, 0);
-}
-
-static struct entry *alloc_entry(void)
-{
- if (nr_entries >= MAX_ENTRIES)
- return NULL;
-
- return entries + nr_entries++;
-}
-
-static int match_entries(struct entry *entry1, struct entry *entry2)
-{
- return entry1->timer == entry2->timer &&
- entry1->start_func == entry2->start_func &&
- entry1->expire_func == entry2->expire_func &&
- entry1->pid == entry2->pid;
-}
-
-/*
- * Look up whether an entry matching this item is present
- * in the hash already. Must be called with irqs off and the
- * lookup lock held:
- */
-static struct entry *tstat_lookup(struct entry *entry, char *comm)
-{
- struct entry **head, *curr, *prev;
-
- head = tstat_hashentry(entry);
- curr = *head;
-
- /*
- * The fastpath is when the entry is already hashed,
- * we do this with the lookup lock held, but with the
- * table lock not held:
- */
- while (curr) {
- if (match_entries(curr, entry))
- return curr;
-
- curr = curr->next;
- }
- /*
- * Slowpath: allocate, set up and link a new hash entry:
- */
- prev = NULL;
- curr = *head;
-
- raw_spin_lock(&table_lock);
- /*
- * Make sure we have not raced with another CPU:
- */
- while (curr) {
- if (match_entries(curr, entry))
- goto out_unlock;
-
- prev = curr;
- curr = curr->next;
- }
-
- curr = alloc_entry();
- if (curr) {
- *curr = *entry;
- curr->count = 0;
- curr->next = NULL;
- memcpy(curr->comm, comm, TASK_COMM_LEN);
-
- smp_mb(); /* Ensure that curr is initialized before insert */
-
- if (prev)
- prev->next = curr;
- else
- *head = curr;
- }
- out_unlock:
- raw_spin_unlock(&table_lock);
-
- return curr;
-}
-
-/**
- * timer_stats_update_stats - Update the statistics for a timer.
- * @timer: pointer to either a timer_list or a hrtimer
- * @pid: the pid of the task which set up the timer
- * @startf: pointer to the function which did the timer setup
- * @timerf: pointer to the timer callback function of the timer
- * @comm: name of the process which set up the timer
- * @tflags: The flags field of the timer
- *
- * When the timer is already registered, then the event counter is
- * incremented. Otherwise the timer is registered in a free slot.
- */
-void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
- void *timerf, char *comm, u32 tflags)
-{
- /*
- * It doesn't matter which lock we take:
- */
- raw_spinlock_t *lock;
- struct entry *entry, input;
- unsigned long flags;
-
- if (likely(!timer_stats_active))
- return;
-
- lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
-
- input.timer = timer;
- input.start_func = startf;
- input.expire_func = timerf;
- input.pid = pid;
- input.flags = tflags;
-
- raw_spin_lock_irqsave(lock, flags);
- if (!timer_stats_active)
- goto out_unlock;
-
- entry = tstat_lookup(&input, comm);
- if (likely(entry))
- entry->count++;
- else
- atomic_inc(&overflow_count);
-
- out_unlock:
- raw_spin_unlock_irqrestore(lock, flags);
-}
-
-static void print_name_offset(struct seq_file *m, unsigned long addr)
-{
- char symname[KSYM_NAME_LEN];
-
- if (lookup_symbol_name(addr, symname) < 0)
- seq_printf(m, "<%p>", (void *)addr);
- else
- seq_printf(m, "%s", symname);
-}
-
-static int tstats_show(struct seq_file *m, void *v)
-{
- struct timespec period;
- struct entry *entry;
- unsigned long ms;
- long events = 0;
- ktime_t time;
- int i;
-
- mutex_lock(&show_mutex);
- /*
- * If still active then calculate up to now:
- */
- if (timer_stats_active)
- time_stop = ktime_get();
-
- time = ktime_sub(time_stop, time_start);
-
- period = ktime_to_timespec(time);
- ms = period.tv_nsec / 1000000;
-
- seq_puts(m, "Timer Stats Version: v0.3\n");
- seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
- if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
- seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
-
- for (i = 0; i < nr_entries; i++) {
- entry = entries + i;
- if (entry->flags & TIMER_DEFERRABLE) {
- seq_printf(m, "%4luD, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
- } else {
- seq_printf(m, " %4lu, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
- }
-
- print_name_offset(m, (unsigned long)entry->start_func);
- seq_puts(m, " (");
- print_name_offset(m, (unsigned long)entry->expire_func);
- seq_puts(m, ")\n");
-
- events += entry->count;
- }
-
- ms += period.tv_sec * 1000;
- if (!ms)
- ms = 1;
-
- if (events && period.tv_sec)
- seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
- events, events * 1000 / ms,
- (events * 1000000 / ms) % 1000);
- else
- seq_printf(m, "%ld total events\n", events);
-
- mutex_unlock(&show_mutex);
-
- return 0;
-}
-
-/*
- * After a state change, make sure all concurrent lookup/update
- * activities have stopped:
- */
-static void sync_access(void)
-{
- unsigned long flags;
- int cpu;
-
- for_each_online_cpu(cpu) {
- raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
-
- raw_spin_lock_irqsave(lock, flags);
- /* nothing */
- raw_spin_unlock_irqrestore(lock, flags);
- }
-}
-
-static ssize_t tstats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *offs)
-{
- char ctl[2];
-
- if (count != 2 || *offs)
- return -EINVAL;
-
- if (copy_from_user(ctl, buf, count))
- return -EFAULT;
-
- mutex_lock(&show_mutex);
- switch (ctl[0]) {
- case '0':
- if (timer_stats_active) {
- timer_stats_active = 0;
- time_stop = ktime_get();
- sync_access();
- }
- break;
- case '1':
- if (!timer_stats_active) {
- reset_entries();
- time_start = ktime_get();
- smp_mb();
- timer_stats_active = 1;
- }
- break;
- default:
- count = -EINVAL;
- }
- mutex_unlock(&show_mutex);
-
- return count;
-}
-
-static int tstats_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, tstats_show, NULL);
-}
-
-static const struct file_operations tstats_fops = {
- .open = tstats_open,
- .read = seq_read,
- .write = tstats_write,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-void __init init_timer_stats(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
-}
-
-static int __init init_tstats_procfs(void)
-{
- struct proc_dir_entry *pe;
-
- pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
- if (!pe)
- return -ENOMEM;
- return 0;
-}
-__initcall(init_tstats_procfs);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 006eefb6ede0..3c7b7a9bcad1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -89,6 +89,31 @@ config RING_BUFFER_ALLOW_SWAP
Allow the use of ring_buffer_swap_cpu.
Adds a very slight overhead to tracing when enabled.
+config IPC_LOGGING
+ bool "Debug Logging for IPC Drivers"
+ select GENERIC_TRACER
+ help
+ This option enables debug logging for IPC drivers.
+
+ If in doubt, say no.
+
+config QCOM_RTB
+ bool "Register tracing"
+ help
+ Add support for logging different events to a small uncached
+ region. This is designed to aid in debugging reset cases where the
+ caches may not be flushed before the target resets.
+
+config QCOM_RTB_SEPARATE_CPUS
+ bool "Separate entries for each cpu"
+ depends on QCOM_RTB
+ depends on SMP
+ help
+ Under some circumstances, it may be beneficial to give dedicated space
+ for each cpu to log accesses. Selecting this option will log each cpu
+ separately. This guarantees that the last accesses for each cpu
+ will be logged, but there will be fewer entries per cpu.
+
# All tracer options should select GENERIC_TRACER. For those options that are
# enabled by all tracers (context switch and event tracer) they select TRACING.
# This allows those options to appear when no other tracer is selected. But the
@@ -499,6 +524,19 @@ config FUNCTION_PROFILER
If in doubt, say N.
+config CPU_FREQ_SWITCH_PROFILER
+ bool "CPU frequency switch time profiler"
+ select GENERIC_TRACER
+ help
+ This option enables the CPU frequency switch profiler. A file is
+ created in debugfs called "cpu_freq_switch_profile_enabled", which
+ defaults to zero. When a 1 is echoed into this file, profiling begins.
+ When a zero is echoed, profiling stops. A "cpu_freq_switch" file is
+ also created in the trace_stats directory; this file shows the
+ switches that have occurred and duration statistics.
+
+ If in doubt, say N.
+
config FTRACE_MCOUNT_RECORD
def_bool y
depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 4b35fb97ae44..a0177ae43058 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_CPU_FREQ_SWITCH_PROFILER) += trace_cpu_freq_switch.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
@@ -70,7 +71,12 @@ endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
+obj-$(CONFIG_QCOM_RTB) += msm_rtb.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+obj-$(CONFIG_IPC_LOGGING) += ipc_logging.o
+ifdef CONFIG_DEBUG_FS
+obj-$(CONFIG_IPC_LOGGING) += ipc_logging_debug.o
+endif
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 210b8e726a97..c39fc68c4778 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -200,9 +200,9 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
+ int rw, u32 what, int error, int pdu_len,
+ void *pdu_data, struct task_struct *tsk)
{
- struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
struct ring_buffer *buffer = NULL;
struct blk_io_trace *t;
@@ -723,18 +723,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
unsigned int nr_bytes, u32 what)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
+ /*
+ * Use the bio context for all events except ISSUE and
+ * COMPLETE events.
+ *
+ * Not all the pages in the bio are dirtied by the same task, but
+ * most likely they are, since the sectors accessed on the device
+ * must be adjacent.
+ */
+ if (!((what == BLK_TA_ISSUE) || (what == BLK_TA_COMPLETE)) &&
+ bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
+ rq->bio->bi_io_vec->bv_page &&
+ rq->bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty;
+
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
__blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
- what, rq->errors, rq->cmd_len, rq->cmd);
+ what, rq->errors, rq->cmd_len, rq->cmd, tsk);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
- rq->cmd_flags, what, rq->errors, 0, NULL);
+ rq->cmd_flags, what, rq->errors, 0, NULL, tsk);
}
}
@@ -786,12 +801,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
+ /*
+ * Not all the pages in the bio are dirtied by the same task, but
+ * most likely they are, since the sectors accessed on the device
+ * must be adjacent.
+ */
+ if (bio_has_data(bio) && bio->bi_io_vec && bio->bi_io_vec->bv_page &&
+ bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, what, error, 0, NULL);
+ bio->bi_rw, what, error, 0, NULL, tsk);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -839,7 +864,8 @@ static void blk_add_trace_getrq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0,
+ NULL, current);
}
}
@@ -855,7 +881,7 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bt)
__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, current);
}
}
@@ -864,7 +890,8 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL,
+ current);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -881,7 +908,8 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu,
+ current);
}
}
@@ -890,13 +918,19 @@ static void blk_add_trace_split(void *ignore,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
+ if (bio_has_data(bio) && bio->bi_io_vec &&
+ bio->bi_io_vec->bv_page &&
+ bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- bio->bi_error, sizeof(rpdu), &rpdu);
+ bio->bi_error, sizeof(rpdu), &rpdu, tsk);
}
}
@@ -919,6 +953,7 @@ static void blk_add_trace_bio_remap(void *ignore,
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
@@ -927,9 +962,14 @@ static void blk_add_trace_bio_remap(void *ignore,
r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
r.sector_from = cpu_to_be64(from);
+ if (bio_has_data(bio) && bio->bi_io_vec &&
+ bio->bi_io_vec->bv_page &&
+ bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
- sizeof(r), &r);
+ sizeof(r), &r, tsk);
}
/**
@@ -952,6 +992,7 @@ static void blk_add_trace_rq_remap(void *ignore,
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
@@ -960,9 +1001,14 @@ static void blk_add_trace_rq_remap(void *ignore,
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
r.sector_from = cpu_to_be64(from);
+ if (bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
+ rq->bio->bi_io_vec->bv_page &&
+ rq->bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty;
+
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
- sizeof(r), &r);
+ sizeof(r), &r, tsk);
}
/**
@@ -981,16 +1027,22 @@ void blk_add_driver_data(struct request_queue *q,
void *data, size_t len)
{
struct blk_trace *bt = q->blk_trace;
+ struct task_struct *tsk = current;
if (likely(!bt))
return;
+ if (bio_has_data(rq->bio) && rq->bio->bi_io_vec &&
+ rq->bio->bi_io_vec->bv_page &&
+ rq->bio->bi_io_vec->bv_page->tsk_dirty)
+ tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty;
+
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, rq->errors, len, data, tsk);
else
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, rq->errors, len, data, tsk);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
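Every hook above repeats the same lookup: prefer the task that dirtied the bio's
first page, falling back to current. A minimal sketch of a helper that would
centralize this heuristic, assuming the same page->tsk_dirty field this patch
relies on; the helper itself is illustrative and not part of the patch:

    /* Illustrative helper: pick the task to attribute a bio's trace events to. */
    static struct task_struct *blk_trace_bio_task(struct bio *bio)
    {
            /* Fall back to current when the bio carries no dirtier information. */
            if (bio && bio_has_data(bio) && bio->bi_io_vec &&
                bio->bi_io_vec->bv_page &&
                bio->bi_io_vec->bv_page->tsk_dirty)
                    return bio->bi_io_vec->bv_page->tsk_dirty;
            return current;
    }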
diff --git a/kernel/trace/ipc_logging.c b/kernel/trace/ipc_logging.c
new file mode 100644
index 000000000000..ed29c38cd7fb
--- /dev/null
+++ b/kernel/trace/ipc_logging.c
@@ -0,0 +1,876 @@
+/* Copyright (c) 2012-2017, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/arch_timer.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/ipc_logging.h>
+
+#include "ipc_logging_private.h"
+
+#define LOG_PAGE_DATA_SIZE sizeof(((struct ipc_log_page *)0)->data)
+#define LOG_PAGE_FLAG (1 << 31)
+
+static LIST_HEAD(ipc_log_context_list);
+static DEFINE_RWLOCK(context_list_lock_lha1);
+static void *get_deserialization_func(struct ipc_log_context *ilctxt,
+ int type);
+
+static struct ipc_log_page *get_first_page(struct ipc_log_context *ilctxt)
+{
+ struct ipc_log_page_header *p_pghdr;
+ struct ipc_log_page *pg = NULL;
+
+ if (!ilctxt)
+ return NULL;
+ p_pghdr = list_first_entry(&ilctxt->page_list,
+ struct ipc_log_page_header, list);
+ pg = container_of(p_pghdr, struct ipc_log_page, hdr);
+ return pg;
+}
+
+/**
+ * is_nd_read_empty - Returns true if no data is available to read in log
+ *
+ * @ilctxt: logging context
+ * @returns: 1 if context is empty; 0 if not empty; <0 for failure
+ *
+ * This is for the debugfs read pointer which allows for a non-destructive read.
+ * There may still be data in the log, but it may have already been read.
+ */
+static int is_nd_read_empty(struct ipc_log_context *ilctxt)
+{
+ if (!ilctxt)
+ return -EINVAL;
+
+ return ((ilctxt->nd_read_page == ilctxt->write_page) &&
+ (ilctxt->nd_read_page->hdr.nd_read_offset ==
+ ilctxt->write_page->hdr.write_offset));
+}
+
+/**
+ * is_read_empty - Returns true if no data is available in log
+ *
+ * @ilctxt: logging context
+ * @returns: 1 if context is empty; 0 if not empty; <0 for failure
+ *
+ * This is for the actual log contents. If it is empty, then there
+ * is no data at all in the log.
+ */
+static int is_read_empty(struct ipc_log_context *ilctxt)
+{
+ if (!ilctxt)
+ return -EINVAL;
+
+ return ((ilctxt->read_page == ilctxt->write_page) &&
+ (ilctxt->read_page->hdr.read_offset ==
+ ilctxt->write_page->hdr.write_offset));
+}
+
+/**
+ * is_nd_read_equal_read - Return true if the non-destructive read is equal to
+ * the destructive read
+ *
+ * @ilctxt: logging context
+ * @returns: true if nd read is equal to read; false otherwise
+ */
+static bool is_nd_read_equal_read(struct ipc_log_context *ilctxt)
+{
+ uint16_t read_offset;
+ uint16_t nd_read_offset;
+
+ if (ilctxt->nd_read_page == ilctxt->read_page) {
+ read_offset = ilctxt->read_page->hdr.read_offset;
+ nd_read_offset = ilctxt->nd_read_page->hdr.nd_read_offset;
+
+ if (read_offset == nd_read_offset)
+ return true;
+ }
+
+ return false;
+}
+
+
+static struct ipc_log_page *get_next_page(struct ipc_log_context *ilctxt,
+ struct ipc_log_page *cur_pg)
+{
+ struct ipc_log_page_header *p_pghdr;
+ struct ipc_log_page *pg = NULL;
+
+ if (!ilctxt || !cur_pg)
+ return NULL;
+
+ if (ilctxt->last_page == cur_pg)
+ return ilctxt->first_page;
+
+ p_pghdr = list_first_entry(&cur_pg->hdr.list,
+ struct ipc_log_page_header, list);
+ pg = container_of(p_pghdr, struct ipc_log_page, hdr);
+
+ return pg;
+}
+
+/**
+ * ipc_log_read - do non-destructive read of the log
+ *
+ * @ilctxt: Logging context
+ * @data: Data pointer to receive the data
+ * @data_size: Number of bytes to read (must be <= bytes available in log)
+ *
+ * This read will update a runtime read pointer, but will not affect the actual
+ * contents of the log which allows for reading the logs continuously while
+ * debugging and if the system crashes, then the full logs can still be
+ * extracted.
+ */
+static void ipc_log_read(struct ipc_log_context *ilctxt,
+ void *data, int data_size)
+{
+ int bytes_to_read;
+
+ bytes_to_read = MIN(LOG_PAGE_DATA_SIZE
+ - ilctxt->nd_read_page->hdr.nd_read_offset,
+ data_size);
+
+ memcpy(data, (ilctxt->nd_read_page->data +
+ ilctxt->nd_read_page->hdr.nd_read_offset), bytes_to_read);
+
+ if (bytes_to_read != data_size) {
+ /* not enough space, wrap read to next page */
+ ilctxt->nd_read_page->hdr.nd_read_offset = 0;
+ ilctxt->nd_read_page = get_next_page(ilctxt,
+ ilctxt->nd_read_page);
+ BUG_ON(ilctxt->nd_read_page == NULL);
+
+ memcpy((data + bytes_to_read),
+ (ilctxt->nd_read_page->data +
+ ilctxt->nd_read_page->hdr.nd_read_offset),
+ (data_size - bytes_to_read));
+ bytes_to_read = (data_size - bytes_to_read);
+ }
+ ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read;
+}
+
+/**
+ * ipc_log_drop - do destructive read of the log
+ *
+ * @ilctxt: Logging context
+ * @data: Data pointer to receive the data (or NULL)
+ * @data_size: Number of bytes to read (must be <= bytes available in log)
+ */
+static void ipc_log_drop(struct ipc_log_context *ilctxt, void *data,
+ int data_size)
+{
+ int bytes_to_read;
+ bool push_nd_read;
+
+ bytes_to_read = MIN(LOG_PAGE_DATA_SIZE
+ - ilctxt->read_page->hdr.read_offset,
+ data_size);
+ if (data)
+ memcpy(data, (ilctxt->read_page->data +
+ ilctxt->read_page->hdr.read_offset), bytes_to_read);
+
+ if (bytes_to_read != data_size) {
+ /* not enough space, wrap read to next page */
+ push_nd_read = is_nd_read_equal_read(ilctxt);
+
+ ilctxt->read_page->hdr.read_offset = 0;
+ if (push_nd_read) {
+ ilctxt->read_page->hdr.nd_read_offset = 0;
+ ilctxt->read_page = get_next_page(ilctxt,
+ ilctxt->read_page);
+ BUG_ON(ilctxt->read_page == NULL);
+ ilctxt->nd_read_page = ilctxt->read_page;
+ } else {
+ ilctxt->read_page = get_next_page(ilctxt,
+ ilctxt->read_page);
+ BUG_ON(ilctxt->read_page == NULL);
+ }
+
+ if (data)
+ memcpy((data + bytes_to_read),
+ (ilctxt->read_page->data +
+ ilctxt->read_page->hdr.read_offset),
+ (data_size - bytes_to_read));
+
+ bytes_to_read = (data_size - bytes_to_read);
+ }
+
+ /* update non-destructive read pointer if necessary */
+ push_nd_read = is_nd_read_equal_read(ilctxt);
+ ilctxt->read_page->hdr.read_offset += bytes_to_read;
+ ilctxt->write_avail += data_size;
+
+ if (push_nd_read)
+ ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read;
+}
+
+/**
+ * msg_read - Reads a message.
+ *
+ * If a message is read successfully, then the message context
+ * will be set to:
+ * .hdr message header .size and .type values
+ * .offset beginning of message data
+ *
+ * @ilctxt Logging context
+ * @ectxt Message context
+ *
+ * @returns 0 - no message available; >0 message size; <0 error
+ */
+static int msg_read(struct ipc_log_context *ilctxt,
+ struct encode_context *ectxt)
+{
+ struct tsv_header hdr;
+
+ if (!ectxt)
+ return -EINVAL;
+
+ if (is_nd_read_empty(ilctxt))
+ return 0;
+
+ ipc_log_read(ilctxt, &hdr, sizeof(hdr));
+ ectxt->hdr.type = hdr.type;
+ ectxt->hdr.size = hdr.size;
+ ectxt->offset = sizeof(hdr);
+ ipc_log_read(ilctxt, (ectxt->buff + ectxt->offset),
+ (int)hdr.size);
+
+ return sizeof(hdr) + (int)hdr.size;
+}
+
+/**
+ * msg_drop - Drops a message.
+ *
+ * @ilctxt Logging context
+ */
+static void msg_drop(struct ipc_log_context *ilctxt)
+{
+ struct tsv_header hdr;
+
+ if (!is_read_empty(ilctxt)) {
+ ipc_log_drop(ilctxt, &hdr, sizeof(hdr));
+ ipc_log_drop(ilctxt, NULL, (int)hdr.size);
+ }
+}
+
+/*
+ * Commits messages to the FIFO. If the FIFO is full, then enough
+ * messages are dropped to create space for the new message.
+ */
+void ipc_log_write(void *ctxt, struct encode_context *ectxt)
+{
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ int bytes_to_write;
+ unsigned long flags;
+
+ if (!ilctxt || !ectxt) {
+ pr_err("%s: Invalid ipc_log or encode context\n", __func__);
+ return;
+ }
+
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ while (ilctxt->write_avail <= ectxt->offset)
+ msg_drop(ilctxt);
+
+ bytes_to_write = MIN(LOG_PAGE_DATA_SIZE
+ - ilctxt->write_page->hdr.write_offset,
+ ectxt->offset);
+ memcpy((ilctxt->write_page->data +
+ ilctxt->write_page->hdr.write_offset),
+ ectxt->buff, bytes_to_write);
+
+ if (bytes_to_write != ectxt->offset) {
+ uint64_t t_now = sched_clock();
+
+ ilctxt->write_page->hdr.write_offset += bytes_to_write;
+ ilctxt->write_page->hdr.end_time = t_now;
+
+ ilctxt->write_page = get_next_page(ilctxt, ilctxt->write_page);
+ BUG_ON(ilctxt->write_page == NULL);
+ ilctxt->write_page->hdr.write_offset = 0;
+ ilctxt->write_page->hdr.start_time = t_now;
+ memcpy((ilctxt->write_page->data +
+ ilctxt->write_page->hdr.write_offset),
+ (ectxt->buff + bytes_to_write),
+ (ectxt->offset - bytes_to_write));
+ bytes_to_write = (ectxt->offset - bytes_to_write);
+ }
+ ilctxt->write_page->hdr.write_offset += bytes_to_write;
+ ilctxt->write_avail -= ectxt->offset;
+ complete(&ilctxt->read_avail);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+}
+EXPORT_SYMBOL(ipc_log_write);
+
+/*
+ * Starts a new message after which you can add serialized data and
+ * then complete the message by calling msg_encode_end().
+ */
+void msg_encode_start(struct encode_context *ectxt, uint32_t type)
+{
+ if (!ectxt) {
+ pr_err("%s: Invalid encode context\n", __func__);
+ return;
+ }
+
+ ectxt->hdr.type = type;
+ ectxt->hdr.size = 0;
+ ectxt->offset = sizeof(ectxt->hdr);
+}
+EXPORT_SYMBOL(msg_encode_start);
+
+/*
+ * Completes the message
+ */
+void msg_encode_end(struct encode_context *ectxt)
+{
+ if (!ectxt) {
+ pr_err("%s: Invalid encode context\n", __func__);
+ return;
+ }
+
+ /* finalize data size */
+ ectxt->hdr.size = ectxt->offset - sizeof(ectxt->hdr);
+ BUG_ON(ectxt->hdr.size > MAX_MSG_SIZE);
+ memcpy(ectxt->buff, &ectxt->hdr, sizeof(ectxt->hdr));
+}
+EXPORT_SYMBOL(msg_encode_end);
+
+/*
+ * Helper function used to write data to a message context.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @data data to write
+ * @size number of bytes of data to write
+ */
+static inline int tsv_write_data(struct encode_context *ectxt,
+ void *data, uint32_t size)
+{
+ if (!ectxt) {
+ pr_err("%s: Invalid encode context\n", __func__);
+ return -EINVAL;
+ }
+ if ((ectxt->offset + size) > MAX_MSG_SIZE) {
+ pr_err("%s: No space to encode further\n", __func__);
+ return -EINVAL;
+ }
+
+ memcpy((void *)(ectxt->buff + ectxt->offset), data, size);
+ ectxt->offset += size;
+ return 0;
+}
+
+/*
+ * Helper function that writes a type to the context.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @type primitive type
+ * @size size of primitive in bytes
+ */
+static inline int tsv_write_header(struct encode_context *ectxt,
+ uint32_t type, uint32_t size)
+{
+ struct tsv_header hdr;
+
+ hdr.type = (unsigned char)type;
+ hdr.size = (unsigned char)size;
+ return tsv_write_data(ectxt, &hdr, sizeof(hdr));
+}
+
+/*
+ * Writes the current timestamp count.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ */
+int tsv_timestamp_write(struct encode_context *ectxt)
+{
+ int ret;
+ uint64_t t_now = sched_clock();
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_TIMESTAMP, sizeof(t_now));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &t_now, sizeof(t_now));
+}
+EXPORT_SYMBOL(tsv_timestamp_write);
+
+/*
+ * Writes the current QTimer timestamp count.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ */
+int tsv_qtimer_write(struct encode_context *ectxt)
+{
+ int ret;
+ uint64_t t_now = arch_counter_get_cntvct();
+
+ ret = tsv_write_header(ectxt, TSV_TYPE_QTIMER, sizeof(t_now));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &t_now, sizeof(t_now));
+}
+EXPORT_SYMBOL(tsv_qtimer_write);
+
+/*
+ * Writes a data pointer.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @pointer pointer value to write
+ */
+int tsv_pointer_write(struct encode_context *ectxt, void *pointer)
+{
+ int ret;
+ ret = tsv_write_header(ectxt, TSV_TYPE_POINTER, sizeof(pointer));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &pointer, sizeof(pointer));
+}
+EXPORT_SYMBOL(tsv_pointer_write);
+
+/*
+ * Writes a 32-bit integer value.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @n integer to write
+ */
+int tsv_int32_write(struct encode_context *ectxt, int32_t n)
+{
+ int ret;
+ ret = tsv_write_header(ectxt, TSV_TYPE_INT32, sizeof(n));
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, &n, sizeof(n));
+}
+EXPORT_SYMBOL(tsv_int32_write);
+
+/*
+ * Writes a byte array.
+ *
+ * @ectxt context initialized by calling msg_encode_start()
+ * @data Beginning address of data
+ * @data_size Size of data to be written
+ */
+int tsv_byte_array_write(struct encode_context *ectxt,
+ void *data, int data_size)
+{
+ int ret;
+ ret = tsv_write_header(ectxt, TSV_TYPE_BYTE_ARRAY, data_size);
+ if (ret)
+ return ret;
+ return tsv_write_data(ectxt, data, data_size);
+}
+EXPORT_SYMBOL(tsv_byte_array_write);
+
+/*
+ * Helper function to log a string
+ *
+ * @ilctxt ipc_log_context created using ipc_log_context_create()
+ * @fmt Data specified using format specifiers
+ */
+int ipc_log_string(void *ilctxt, const char *fmt, ...)
+{
+ struct encode_context ectxt;
+ int avail_size, data_size, hdr_size = sizeof(struct tsv_header);
+ va_list arg_list;
+
+ if (!ilctxt)
+ return -EINVAL;
+
+ msg_encode_start(&ectxt, TSV_TYPE_STRING);
+ tsv_timestamp_write(&ectxt);
+ tsv_qtimer_write(&ectxt);
+ avail_size = (MAX_MSG_SIZE - (ectxt.offset + hdr_size));
+ va_start(arg_list, fmt);
+ data_size = vscnprintf((ectxt.buff + ectxt.offset + hdr_size),
+ avail_size, fmt, arg_list);
+ va_end(arg_list);
+ tsv_write_header(&ectxt, TSV_TYPE_BYTE_ARRAY, data_size);
+ ectxt.offset += data_size;
+ msg_encode_end(&ectxt);
+ ipc_log_write(ilctxt, &ectxt);
+ return 0;
+}
+EXPORT_SYMBOL(ipc_log_string);
+
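A minimal client sketch of the string-logging path, assuming a hypothetical
driver name and page count; ipc_log_context_create() is defined further down in
this file and declared in linux/ipc_logging.h:

    /* Sketch only: allocate a context once, then log printf-style strings. */
    static void *my_ipc_log;

    static int my_driver_init_logging(void)
    {
            /* Two pages of log space, exposed under debugfs ipc_logging/my_driver/ */
            my_ipc_log = ipc_log_context_create(2, "my_driver", 0);
            if (!my_ipc_log)
                    return -ENOMEM;

            ipc_log_string(my_ipc_log, "probe done, irq=%d", 42);
            return 0;
    }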
+/**
+ * ipc_log_extract - Reads and deserializes log
+ *
+ * @ctxt: logging context
+ * @buff: buffer to receive the data
+ * @size: size of the buffer
+ * @returns: 0 if no data read; >0 number of bytes read; < 0 error
+ *
+ * If no data is available to be read, then the ilctxt::read_avail
+ * completion is reinitialized. This allows clients to block
+ * until new log data is saved.
+ */
+int ipc_log_extract(void *ctxt, char *buff, int size)
+{
+ struct encode_context ectxt;
+ struct decode_context dctxt;
+ void (*deserialize_func)(struct encode_context *ectxt,
+ struct decode_context *dctxt);
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ unsigned long flags;
+
+ if (size < MAX_MSG_DECODED_SIZE)
+ return -EINVAL;
+
+ dctxt.output_format = OUTPUT_DEBUGFS;
+ dctxt.buff = buff;
+ dctxt.size = size;
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ while (dctxt.size >= MAX_MSG_DECODED_SIZE &&
+ !is_nd_read_empty(ilctxt)) {
+ msg_read(ilctxt, &ectxt);
+ deserialize_func = get_deserialization_func(ilctxt,
+ ectxt.hdr.type);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+ if (deserialize_func)
+ deserialize_func(&ectxt, &dctxt);
+ else
+ pr_err("%s: unknown message 0x%x\n",
+ __func__, ectxt.hdr.type);
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ }
+ if ((size - dctxt.size) == 0)
+ reinit_completion(&ilctxt->read_avail);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+ return size - dctxt.size;
+}
+EXPORT_SYMBOL(ipc_log_extract);
+
+/*
+ * Helper function used to read data from a message context.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @data data to read
+ * @size number of bytes of data to read
+ */
+static void tsv_read_data(struct encode_context *ectxt,
+ void *data, uint32_t size)
+{
+ BUG_ON((ectxt->offset + size) > MAX_MSG_SIZE);
+ memcpy(data, (ectxt->buff + ectxt->offset), size);
+ ectxt->offset += size;
+}
+
+/*
+ * Helper function that reads a type from the context and updates the
+ * context pointers.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @hdr type header
+ */
+static void tsv_read_header(struct encode_context *ectxt,
+ struct tsv_header *hdr)
+{
+ BUG_ON((ectxt->offset + sizeof(*hdr)) > MAX_MSG_SIZE);
+ memcpy(hdr, (ectxt->buff + ectxt->offset), sizeof(*hdr));
+ ectxt->offset += sizeof(*hdr);
+}
+
+/*
+ * Reads a timestamp.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format (appended to the %6u.%09lu timestamp format)
+ */
+void tsv_timestamp_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ uint64_t val;
+ unsigned long nanosec_rem;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_TIMESTAMP);
+ tsv_read_data(ectxt, &val, sizeof(val));
+ nanosec_rem = do_div(val, 1000000000U);
+ IPC_SPRINTF_DECODE(dctxt, "[%6u.%09lu%s/",
+ (unsigned)val, nanosec_rem, format);
+}
+EXPORT_SYMBOL(tsv_timestamp_read);
+
+/*
+ * Reads a QTimer timestamp.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format (appended to %#18llx timestamp format)
+ */
+void tsv_qtimer_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ uint64_t val;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_QTIMER);
+ tsv_read_data(ectxt, &val, sizeof(val));
+
+ /*
+ * This gives 16 hex digits of output. The # prefix prepends
+ * a 0x, and these characters count as part of the number.
+ */
+ IPC_SPRINTF_DECODE(dctxt, "%#18llx]%s", val, format);
+}
+EXPORT_SYMBOL(tsv_qtimer_read);
+
+/*
+ * Reads a data pointer.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format
+ */
+void tsv_pointer_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ void *val;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_POINTER);
+ tsv_read_data(ectxt, &val, sizeof(val));
+
+ IPC_SPRINTF_DECODE(dctxt, format, val);
+}
+EXPORT_SYMBOL(tsv_pointer_read);
+
+/*
+ * Reads a 32-bit integer value.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format
+ */
+int32_t tsv_int32_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+ int32_t val;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_INT32);
+ tsv_read_data(ectxt, &val, sizeof(val));
+
+ IPC_SPRINTF_DECODE(dctxt, format, val);
+ return val;
+}
+EXPORT_SYMBOL(tsv_int32_read);
+
+/*
+ * Reads a byte array/string.
+ *
+ * @ectxt context initialized by calling msg_read()
+ * @dctxt deserialization context
+ * @format output format
+ */
+void tsv_byte_array_read(struct encode_context *ectxt,
+ struct decode_context *dctxt, const char *format)
+{
+ struct tsv_header hdr;
+
+ tsv_read_header(ectxt, &hdr);
+ BUG_ON(hdr.type != TSV_TYPE_BYTE_ARRAY);
+ tsv_read_data(ectxt, dctxt->buff, hdr.size);
+ dctxt->buff += hdr.size;
+ dctxt->size -= hdr.size;
+}
+EXPORT_SYMBOL(tsv_byte_array_read);
+
+int add_deserialization_func(void *ctxt, int type,
+ void (*dfunc)(struct encode_context *,
+ struct decode_context *))
+{
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ struct dfunc_info *df_info;
+ unsigned long flags;
+
+ if (!ilctxt || !dfunc)
+ return -EINVAL;
+
+ df_info = kmalloc(sizeof(struct dfunc_info), GFP_KERNEL);
+ if (!df_info)
+ return -ENOSPC;
+
+ read_lock_irqsave(&context_list_lock_lha1, flags);
+ spin_lock(&ilctxt->context_lock_lhb1);
+ df_info->type = type;
+ df_info->dfunc = dfunc;
+ list_add_tail(&df_info->list, &ilctxt->dfunc_info_list);
+ spin_unlock(&ilctxt->context_lock_lhb1);
+ read_unlock_irqrestore(&context_list_lock_lha1, flags);
+ return 0;
+}
+EXPORT_SYMBOL(add_deserialization_func);
+
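The encode and decode halves are meant to be used in matched pairs: whatever a
writer serializes with the tsv_*_write() helpers, the deserializer registered
through add_deserialization_func() must read back in the same order. A sketch
under that assumption; MY_MSG_TYPE is a placeholder for a user-defined message
type id from the public header, and the value chosen here is illustrative:

    /* Sketch only: a custom message and its matching debugfs decoder. */
    #define MY_MSG_TYPE 0x20        /* hypothetical user-defined message type id */

    static void my_msg_write(void *ilctxt, int32_t value)
    {
            struct encode_context ectxt;

            msg_encode_start(&ectxt, MY_MSG_TYPE);
            tsv_timestamp_write(&ectxt);
            tsv_int32_write(&ectxt, value);
            msg_encode_end(&ectxt);
            ipc_log_write(ilctxt, &ectxt);
    }

    static void my_msg_read(struct encode_context *ectxt,
                            struct decode_context *dctxt)
    {
            tsv_timestamp_read(ectxt, dctxt, "");
            tsv_int32_read(ectxt, dctxt, " value=%d\n");
    }

    /* Registered once per context:
     * add_deserialization_func(ilctxt, MY_MSG_TYPE, my_msg_read);
     */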
+static void *get_deserialization_func(struct ipc_log_context *ilctxt,
+ int type)
+{
+ struct dfunc_info *df_info = NULL;
+
+ if (!ilctxt)
+ return NULL;
+
+ list_for_each_entry(df_info, &ilctxt->dfunc_info_list, list) {
+ if (df_info->type == type)
+ return df_info->dfunc;
+ }
+ return NULL;
+}
+
+/**
+ * ipc_log_context_create: Create a debug log context
+ * Should not be called from atomic context
+ *
+ * @max_num_pages: Number of pages of logging space required (max. 10)
+ * @mod_name : Name of the directory entry under DEBUGFS
+ * @user_version : Version number of user-defined message formats
+ *
+ * returns context id on success, NULL on failure
+ */
+void *ipc_log_context_create(int max_num_pages,
+ const char *mod_name, uint16_t user_version)
+{
+ struct ipc_log_context *ctxt;
+ struct ipc_log_page *pg = NULL;
+ int page_cnt;
+ unsigned long flags;
+
+ ctxt = kzalloc(sizeof(struct ipc_log_context), GFP_KERNEL);
+ if (!ctxt) {
+ pr_err("%s: cannot create ipc_log_context\n", __func__);
+ return 0;
+ }
+
+ init_completion(&ctxt->read_avail);
+ INIT_LIST_HEAD(&ctxt->page_list);
+ INIT_LIST_HEAD(&ctxt->dfunc_info_list);
+ spin_lock_init(&ctxt->context_lock_lhb1);
+ for (page_cnt = 0; page_cnt < max_num_pages; page_cnt++) {
+ pg = kzalloc(sizeof(struct ipc_log_page), GFP_KERNEL);
+ if (!pg) {
+ pr_err("%s: cannot create ipc_log_page\n", __func__);
+ goto release_ipc_log_context;
+ }
+ pg->hdr.log_id = (uint64_t)(uintptr_t)ctxt;
+ pg->hdr.page_num = LOG_PAGE_FLAG | page_cnt;
+ pg->hdr.ctx_offset = (int64_t)((uint64_t)(uintptr_t)ctxt -
+ (uint64_t)(uintptr_t)&pg->hdr);
+
+ /* set magic last to signal that page init is complete */
+ pg->hdr.magic = IPC_LOGGING_MAGIC_NUM;
+ pg->hdr.nmagic = ~(IPC_LOGGING_MAGIC_NUM);
+
+ spin_lock_irqsave(&ctxt->context_lock_lhb1, flags);
+ list_add_tail(&pg->hdr.list, &ctxt->page_list);
+ spin_unlock_irqrestore(&ctxt->context_lock_lhb1, flags);
+ }
+
+ ctxt->log_id = (uint64_t)(uintptr_t)ctxt;
+ ctxt->version = IPC_LOG_VERSION;
+ strlcpy(ctxt->name, mod_name, IPC_LOG_MAX_CONTEXT_NAME_LEN);
+ ctxt->user_version = user_version;
+ ctxt->first_page = get_first_page(ctxt);
+ ctxt->last_page = pg;
+ ctxt->write_page = ctxt->first_page;
+ ctxt->read_page = ctxt->first_page;
+ ctxt->nd_read_page = ctxt->first_page;
+ ctxt->write_avail = max_num_pages * LOG_PAGE_DATA_SIZE;
+ ctxt->header_size = sizeof(struct ipc_log_page_header);
+ create_ctx_debugfs(ctxt, mod_name);
+
+ /* set magic last to signal context init is complete */
+ ctxt->magic = IPC_LOG_CONTEXT_MAGIC_NUM;
+ ctxt->nmagic = ~(IPC_LOG_CONTEXT_MAGIC_NUM);
+
+ write_lock_irqsave(&context_list_lock_lha1, flags);
+ list_add_tail(&ctxt->list, &ipc_log_context_list);
+ write_unlock_irqrestore(&context_list_lock_lha1, flags);
+ return (void *)ctxt;
+
+release_ipc_log_context:
+ while (page_cnt-- > 0) {
+ pg = get_first_page(ctxt);
+ list_del(&pg->hdr.list);
+ kfree(pg);
+ }
+ kfree(ctxt);
+ return 0;
+}
+EXPORT_SYMBOL(ipc_log_context_create);
+
+/*
+ * Destroy debug log context
+ *
+ * @ctxt: debug log context created by calling ipc_log_context_create API.
+ */
+int ipc_log_context_destroy(void *ctxt)
+{
+ struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt;
+ struct ipc_log_page *pg = NULL;
+ unsigned long flags;
+
+ if (!ilctxt)
+ return 0;
+
+ while (!list_empty(&ilctxt->page_list)) {
+ pg = get_first_page(ctxt);
+ list_del(&pg->hdr.list);
+ kfree(pg);
+ }
+
+ write_lock_irqsave(&context_list_lock_lha1, flags);
+ list_del(&ilctxt->list);
+ write_unlock_irqrestore(&context_list_lock_lha1, flags);
+
+ debugfs_remove_recursive(ilctxt->dent);
+
+ kfree(ilctxt);
+ return 0;
+}
+EXPORT_SYMBOL(ipc_log_context_destroy);
+
+static int __init ipc_logging_init(void)
+{
+ check_and_create_debugfs();
+ return 0;
+}
+
+module_init(ipc_logging_init);
+
+MODULE_DESCRIPTION("ipc logging");
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/ipc_logging_debug.c b/kernel/trace/ipc_logging_debug.c
new file mode 100644
index 000000000000..a54538798f2b
--- /dev/null
+++ b/kernel/trace/ipc_logging_debug.c
@@ -0,0 +1,184 @@
+/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/ipc_logging.h>
+
+#include "ipc_logging_private.h"
+
+static DEFINE_MUTEX(ipc_log_debugfs_init_lock);
+static struct dentry *root_dent;
+
+static int debug_log(struct ipc_log_context *ilctxt,
+ char *buff, int size, int cont)
+{
+ int i = 0;
+ int ret;
+
+ if (size < MAX_MSG_DECODED_SIZE) {
+ pr_err("%s: buffer size %d < %d\n", __func__, size,
+ MAX_MSG_DECODED_SIZE);
+ return -ENOMEM;
+ }
+ do {
+ i = ipc_log_extract(ilctxt, buff, size - 1);
+ if (cont && i == 0) {
+ ret = wait_for_completion_interruptible(
+ &ilctxt->read_avail);
+ if (ret < 0)
+ return ret;
+ }
+ } while (cont && i == 0);
+
+ return i;
+}
+
+/*
+ * VFS Read operation helper which dispatches the call to the debugfs
+ * read command stored in file->private_data.
+ *
+ * @file File structure
+ * @buff user buffer
+ * @count size of user buffer
+ * @ppos file position to read from (only a value of 0 is accepted)
+ * @cont 1 = continuous mode (don't return 0 to signal end-of-file)
+ *
+ * @returns ==0 end of file
+ * >0 number of bytes read
+ * <0 error
+ */
+static ssize_t debug_read_helper(struct file *file, char __user *buff,
+ size_t count, loff_t *ppos, int cont)
+{
+ struct ipc_log_context *ilctxt = file->private_data;
+ char *buffer;
+ int bsize;
+
+ buffer = kmalloc(count, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ bsize = debug_log(ilctxt, buffer, count, cont);
+ if (bsize > 0) {
+ if (copy_to_user(buff, buffer, bsize)) {
+ kfree(buffer);
+ return -EFAULT;
+ }
+ *ppos += bsize;
+ }
+ kfree(buffer);
+ return bsize;
+}
+
+static ssize_t debug_read(struct file *file, char __user *buff,
+ size_t count, loff_t *ppos)
+{
+ return debug_read_helper(file, buff, count, ppos, 0);
+}
+
+static ssize_t debug_read_cont(struct file *file, char __user *buff,
+ size_t count, loff_t *ppos)
+{
+ return debug_read_helper(file, buff, count, ppos, 1);
+}
+
+static int debug_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static const struct file_operations debug_ops = {
+ .read = debug_read,
+ .open = debug_open,
+};
+
+static const struct file_operations debug_ops_cont = {
+ .read = debug_read_cont,
+ .open = debug_open,
+};
+
+static void debug_create(const char *name, mode_t mode,
+ struct dentry *dent,
+ struct ipc_log_context *ilctxt,
+ const struct file_operations *fops)
+{
+ debugfs_create_file(name, mode, dent, ilctxt, fops);
+}
+
+static void dfunc_string(struct encode_context *ectxt,
+ struct decode_context *dctxt)
+{
+ tsv_timestamp_read(ectxt, dctxt, "");
+ tsv_qtimer_read(ectxt, dctxt, " ");
+ tsv_byte_array_read(ectxt, dctxt, "");
+
+ /* add trailing \n if necessary */
+ if (*(dctxt->buff - 1) != '\n') {
+ if (dctxt->size) {
+ ++dctxt->buff;
+ --dctxt->size;
+ }
+ *(dctxt->buff - 1) = '\n';
+ }
+}
+
+void check_and_create_debugfs(void)
+{
+ mutex_lock(&ipc_log_debugfs_init_lock);
+ if (!root_dent) {
+ root_dent = debugfs_create_dir("ipc_logging", 0);
+
+ if (IS_ERR(root_dent)) {
+ pr_err("%s: unable to create debugfs %ld\n",
+ __func__, PTR_ERR(root_dent));
+ root_dent = NULL;
+ }
+ }
+ mutex_unlock(&ipc_log_debugfs_init_lock);
+}
+EXPORT_SYMBOL(check_and_create_debugfs);
+
+void create_ctx_debugfs(struct ipc_log_context *ctxt,
+ const char *mod_name)
+{
+ if (!root_dent)
+ check_and_create_debugfs();
+
+ if (root_dent) {
+ ctxt->dent = debugfs_create_dir(mod_name, root_dent);
+ if (!IS_ERR(ctxt->dent)) {
+ debug_create("log", 0444, ctxt->dent,
+ ctxt, &debug_ops);
+ debug_create("log_cont", 0444, ctxt->dent,
+ ctxt, &debug_ops_cont);
+ }
+ }
+ add_deserialization_func((void *)ctxt,
+ TSV_TYPE_STRING, dfunc_string);
+}
+EXPORT_SYMBOL(create_ctx_debugfs);
diff --git a/kernel/trace/ipc_logging_private.h b/kernel/trace/ipc_logging_private.h
new file mode 100644
index 000000000000..3ac950695086
--- /dev/null
+++ b/kernel/trace/ipc_logging_private.h
@@ -0,0 +1,165 @@
+/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef _IPC_LOGGING_PRIVATE_H
+#define _IPC_LOGGING_PRIVATE_H
+
+#include <linux/ipc_logging.h>
+
+#define IPC_LOG_VERSION 0x0003
+#define IPC_LOG_MAX_CONTEXT_NAME_LEN 32
+
+/**
+ * struct ipc_log_page_header - Individual log page header
+ *
+ * @magic: Magic number (used for log extraction)
+ * @nmagic: Inverse of magic number (used for log extraction)
+ * @page_num: Index of page (0.. N - 1) (note top bit is always set)
+ * @read_offset: Read offset in page
+ * @write_offset: Write offset in page (or 0xFFFF if full)
+ * @log_id: ID of logging context that owns this page
+ * @start_time: Scheduler clock for first write time in page
+ * @end_time: Scheduler clock for last write time in page
+ * @ctx_offset: Signed offset from page to the logging context. Used to
+ * optimize ram-dump extraction.
+ *
+ * @list: Linked list of pages that make up a log
+ * @nd_read_offset: Non-destructive read offset used for debugfs
+ *
+ * The first part of the structure defines data that is used to extract the
+ * logs from a memory dump and elements in this section should not be changed
+ * or re-ordered. New local data structures can be added to the end of the
+ * structure since they will be ignored by the extraction tool.
+ */
+struct ipc_log_page_header {
+ uint32_t magic;
+ uint32_t nmagic;
+ uint32_t page_num;
+ uint16_t read_offset;
+ uint16_t write_offset;
+ uint64_t log_id;
+ uint64_t start_time;
+ uint64_t end_time;
+ int64_t ctx_offset;
+
+ /* add local data structures after this point */
+ struct list_head list;
+ uint16_t nd_read_offset;
+};
+
+/**
+ * struct ipc_log_page - Individual log page
+ *
+ * @hdr: Log page header
+ * @data: Log data
+ *
+ * Each log consists of 1 to N log pages. Data size is adjusted to always fit
+ * the structure into a single kernel page.
+ */
+struct ipc_log_page {
+ struct ipc_log_page_header hdr;
+ char data[PAGE_SIZE - sizeof(struct ipc_log_page_header)];
+};
+
+/**
+ * struct ipc_log_context - main logging context
+ *
+ * @magic: Magic number (used for log extraction)
+ * @nmagic: Inverse of magic number (used for log extraction)
+ * @version: IPC Logging version of log format
+ * @user_version: Version number for user-defined messages
+ * @header_size: Size of the log header which is used to determine the offset
+ * of ipc_log_page::data
+ * @log_id: Log ID (assigned when log is created)
+ * @name: Name of the log used to uniquely identify the log during extraction
+ *
+ * @list: List of log contexts (struct ipc_log_context)
+ * @page_list: List of log pages (struct ipc_log_page)
+ * @first_page: First page in list of logging pages
+ * @last_page: Last page in list of logging pages
+ * @write_page: Current write page
+ * @read_page: Current read page (for internal reads)
+ * @nd_read_page: Current debugfs extraction page (non-destructive)
+ *
+ * @write_avail: Number of bytes available to write in all pages
+ * @dent: Debugfs node for run-time log extraction
+ * @dfunc_info_list: List of deserialization functions
+ * @context_lock_lhb1: Lock for entire structure
+ * @read_avail: Completed when new data is added to the log
+ */
+struct ipc_log_context {
+ uint32_t magic;
+ uint32_t nmagic;
+ uint32_t version;
+ uint16_t user_version;
+ uint16_t header_size;
+ uint64_t log_id;
+ char name[IPC_LOG_MAX_CONTEXT_NAME_LEN];
+
+ /* add local data structures after this point */
+ struct list_head list;
+ struct list_head page_list;
+ struct ipc_log_page *first_page;
+ struct ipc_log_page *last_page;
+ struct ipc_log_page *write_page;
+ struct ipc_log_page *read_page;
+ struct ipc_log_page *nd_read_page;
+
+ uint32_t write_avail;
+ struct dentry *dent;
+ struct list_head dfunc_info_list;
+ spinlock_t context_lock_lhb1;
+ struct completion read_avail;
+};
+
+struct dfunc_info {
+ struct list_head list;
+ int type;
+ void (*dfunc) (struct encode_context *, struct decode_context *);
+};
+
+enum {
+ TSV_TYPE_INVALID,
+ TSV_TYPE_TIMESTAMP,
+ TSV_TYPE_POINTER,
+ TSV_TYPE_INT32,
+ TSV_TYPE_BYTE_ARRAY,
+ TSV_TYPE_QTIMER,
+};
+
+enum {
+ OUTPUT_DEBUGFS,
+};
+
+#define IPC_LOG_CONTEXT_MAGIC_NUM 0x25874452
+#define IPC_LOGGING_MAGIC_NUM 0x52784425
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define IS_MSG_TYPE(x) (((x) > TSV_TYPE_MSG_START) && \
+ ((x) < TSV_TYPE_MSG_END))
+#define MAX_MSG_DECODED_SIZE (MAX_MSG_SIZE*4)
+
+#if (defined(CONFIG_DEBUG_FS))
+void check_and_create_debugfs(void);
+
+void create_ctx_debugfs(struct ipc_log_context *ctxt,
+ const char *mod_name);
+#else
+void check_and_create_debugfs(void)
+{
+}
+
+void create_ctx_debugfs(struct ipc_log_context *ctxt, const char *mod_name)
+{
+}
+#endif
+
+#endif
diff --git a/kernel/trace/msm_rtb.c b/kernel/trace/msm_rtb.c
new file mode 100644
index 000000000000..587082117842
--- /dev/null
+++ b/kernel/trace/msm_rtb.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/io.h>
+#include <asm-generic/sizes.h>
+#include <linux/msm_rtb.h>
+#include <asm/timex.h>
+#include <soc/qcom/minidump.h>
+
+#define SENTINEL_BYTE_1 0xFF
+#define SENTINEL_BYTE_2 0xAA
+#define SENTINEL_BYTE_3 0xFF
+
+#define RTB_COMPAT_STR "qcom,msm-rtb"
+
+/* Each record is written as:
+ * 1) 3 bytes of sentinel
+ * 2) 1 byte of log type
+ * 3) 4 bytes of index
+ * 4) 8 bytes of where the caller came from
+ * 5) 8 bytes of extra data from the caller
+ * 6) 8 bytes of timestamp
+ * 7) 8 bytes of cycle count
+ *
+ * Total = 40 bytes.
+ */
+struct msm_rtb_layout {
+ unsigned char sentinel[3];
+ unsigned char log_type;
+ uint32_t idx;
+ uint64_t caller;
+ uint64_t data;
+ uint64_t timestamp;
+ uint64_t cycle_count;
+} __attribute__ ((__packed__));
+
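Because the structure is packed and prefixed with sentinel bytes, its exact
40-byte size matters to whatever parses the buffer after a reset. A compile-time
check along these lines would catch accidental layout changes (a sketch, not
part of the patch):

    /* Sketch only: the packed record must stay exactly 40 bytes. */
    static inline void msm_rtb_layout_check(void)
    {
            BUILD_BUG_ON(sizeof(struct msm_rtb_layout) != 40);
    }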
+
+struct msm_rtb_state {
+ struct msm_rtb_layout *rtb;
+ phys_addr_t phys;
+ int nentries;
+ int size;
+ int enabled;
+ int initialized;
+ uint32_t filter;
+ int step_size;
+};
+
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+DEFINE_PER_CPU(atomic_t, msm_rtb_idx_cpu);
+#else
+static atomic_t msm_rtb_idx;
+#endif
+
+static struct msm_rtb_state msm_rtb = {
+ .filter = 1 << LOGK_LOGBUF,
+ .enabled = 1,
+};
+
+module_param_named(filter, msm_rtb.filter, uint, 0644);
+module_param_named(enable, msm_rtb.enabled, int, 0644);
+
+static int msm_rtb_panic_notifier(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ msm_rtb.enabled = 0;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block msm_rtb_panic_blk = {
+ .notifier_call = msm_rtb_panic_notifier,
+ .priority = INT_MAX,
+};
+
+int notrace msm_rtb_event_should_log(enum logk_event_type log_type)
+{
+ return msm_rtb.initialized && msm_rtb.enabled &&
+ ((1 << (log_type & ~LOGTYPE_NOPC)) & msm_rtb.filter);
+}
+EXPORT_SYMBOL(msm_rtb_event_should_log);
+
+static void msm_rtb_emit_sentinel(struct msm_rtb_layout *start)
+{
+ start->sentinel[0] = SENTINEL_BYTE_1;
+ start->sentinel[1] = SENTINEL_BYTE_2;
+ start->sentinel[2] = SENTINEL_BYTE_3;
+}
+
+static void msm_rtb_write_type(enum logk_event_type log_type,
+ struct msm_rtb_layout *start)
+{
+ start->log_type = (char)log_type;
+}
+
+static void msm_rtb_write_caller(uint64_t caller, struct msm_rtb_layout *start)
+{
+ start->caller = caller;
+}
+
+static void msm_rtb_write_idx(uint32_t idx,
+ struct msm_rtb_layout *start)
+{
+ start->idx = idx;
+}
+
+static void msm_rtb_write_data(uint64_t data, struct msm_rtb_layout *start)
+{
+ start->data = data;
+}
+
+static void msm_rtb_write_timestamp(struct msm_rtb_layout *start)
+{
+ start->timestamp = sched_clock();
+}
+
+static void msm_rtb_write_cyclecount(struct msm_rtb_layout *start)
+{
+ start->cycle_count = get_cycles();
+}
+
+static void uncached_logk_pc_idx(enum logk_event_type log_type, uint64_t caller,
+ uint64_t data, int idx)
+{
+ struct msm_rtb_layout *start;
+
+ start = &msm_rtb.rtb[idx & (msm_rtb.nentries - 1)];
+
+ msm_rtb_emit_sentinel(start);
+ msm_rtb_write_type(log_type, start);
+ msm_rtb_write_caller(caller, start);
+ msm_rtb_write_idx(idx, start);
+ msm_rtb_write_data(data, start);
+ msm_rtb_write_timestamp(start);
+ msm_rtb_write_cyclecount(start);
+ mb();
+
+ return;
+}
+
+static void uncached_logk_timestamp(int idx)
+{
+ unsigned long long timestamp;
+
+ timestamp = sched_clock();
+ uncached_logk_pc_idx(LOGK_TIMESTAMP|LOGTYPE_NOPC,
+ (uint64_t)lower_32_bits(timestamp),
+ (uint64_t)upper_32_bits(timestamp), idx);
+}
+
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+static int msm_rtb_get_idx(void)
+{
+ int cpu, i, offset;
+ atomic_t *index;
+
+ /*
+ * ideally we would use get_cpu but this is a close enough
+ * approximation for our purposes.
+ */
+ cpu = raw_smp_processor_id();
+
+ index = &per_cpu(msm_rtb_idx_cpu, cpu);
+
+ i = atomic_add_return(msm_rtb.step_size, index);
+ i -= msm_rtb.step_size;
+
+ /* Check if index has wrapped around */
+ offset = (i & (msm_rtb.nentries - 1)) -
+ ((i - msm_rtb.step_size) & (msm_rtb.nentries - 1));
+ if (offset < 0) {
+ uncached_logk_timestamp(i);
+ i = atomic_add_return(msm_rtb.step_size, index);
+ i -= msm_rtb.step_size;
+ }
+
+ return i;
+}
+#else
+static int msm_rtb_get_idx(void)
+{
+ int i, offset;
+
+ i = atomic_inc_return(&msm_rtb_idx);
+ i--;
+
+ /* Check if index has wrapped around */
+ offset = (i & (msm_rtb.nentries - 1)) -
+ ((i - 1) & (msm_rtb.nentries - 1));
+ if (offset < 0) {
+ uncached_logk_timestamp(i);
+ i = atomic_inc_return(&msm_rtb_idx);
+ i--;
+ }
+
+ return i;
+}
+#endif
+
+int notrace uncached_logk_pc(enum logk_event_type log_type, void *caller,
+ void *data)
+{
+ int i;
+
+ if (!msm_rtb_event_should_log(log_type))
+ return 0;
+
+ i = msm_rtb_get_idx();
+ uncached_logk_pc_idx(log_type, (uint64_t)((unsigned long) caller),
+ (uint64_t)((unsigned long) data), i);
+
+ return 1;
+}
+EXPORT_SYMBOL(uncached_logk_pc);
+
+noinline int notrace uncached_logk(enum logk_event_type log_type, void *data)
+{
+ return uncached_logk_pc(log_type, __builtin_return_address(0), data);
+}
+EXPORT_SYMBOL(uncached_logk);
+
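A caller-side sketch: one call records the caller's return address plus a word
of payload, provided the event type passes the filter. LOGK_LOGBUF is used here
only because it is the default filter bit set above; real callers pass whichever
logk_event_type matches the event being logged:

    /* Sketch only: emit one RTB entry from an arbitrary code path. */
    static void my_rtb_log(void *payload)
    {
            uncached_logk(LOGK_LOGBUF, payload);
    }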
+static int msm_rtb_probe(struct platform_device *pdev)
+{
+ struct msm_rtb_platform_data *d = pdev->dev.platform_data;
+ struct md_region md_entry;
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+ unsigned int cpu;
+#endif
+ int ret;
+
+ if (!pdev->dev.of_node) {
+ msm_rtb.size = d->size;
+ } else {
+ u64 size;
+ struct device_node *pnode;
+
+ pnode = of_parse_phandle(pdev->dev.of_node,
+ "linux,contiguous-region", 0);
+ if (pnode != NULL) {
+ const u32 *addr;
+
+ addr = of_get_address(pnode, 0, &size, NULL);
+ if (!addr) {
+ of_node_put(pnode);
+ return -EINVAL;
+ }
+ of_node_put(pnode);
+ } else {
+ ret = of_property_read_u32(pdev->dev.of_node,
+ "qcom,rtb-size",
+ (u32 *)&size);
+ if (ret < 0)
+ return ret;
+
+ }
+
+ msm_rtb.size = size;
+ }
+
+ if (msm_rtb.size <= 0 || msm_rtb.size > SZ_1M)
+ return -EINVAL;
+
+ msm_rtb.rtb = dma_alloc_coherent(&pdev->dev, msm_rtb.size,
+ &msm_rtb.phys,
+ GFP_KERNEL);
+
+ if (!msm_rtb.rtb)
+ return -ENOMEM;
+
+ msm_rtb.nentries = msm_rtb.size / sizeof(struct msm_rtb_layout);
+
+ /* Round this down to a power of 2 */
+ msm_rtb.nentries = __rounddown_pow_of_two(msm_rtb.nentries);
+
+ memset(msm_rtb.rtb, 0, msm_rtb.size);
+
+ strlcpy(md_entry.name, "KRTB_BUF", sizeof(md_entry.name));
+ md_entry.virt_addr = (uintptr_t)msm_rtb.rtb;
+ md_entry.phys_addr = msm_rtb.phys;
+ md_entry.size = msm_rtb.size;
+ if (msm_minidump_add_region(&md_entry))
+ pr_info("Failed to add RTB in Minidump\n");
+
+#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS)
+ for_each_possible_cpu(cpu) {
+ atomic_t *a = &per_cpu(msm_rtb_idx_cpu, cpu);
+ atomic_set(a, cpu);
+ }
+ msm_rtb.step_size = num_possible_cpus();
+#else
+ atomic_set(&msm_rtb_idx, 0);
+ msm_rtb.step_size = 1;
+#endif
+
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &msm_rtb_panic_blk);
+ msm_rtb.initialized = 1;
+ return 0;
+}
+
+static struct of_device_id msm_match_table[] = {
+ {.compatible = RTB_COMPAT_STR},
+ {},
+};
+
+static struct platform_driver msm_rtb_driver = {
+ .driver = {
+ .name = "msm_rtb",
+ .owner = THIS_MODULE,
+ .of_match_table = msm_match_table
+ },
+};
+
+static int __init msm_rtb_init(void)
+{
+ return platform_driver_probe(&msm_rtb_driver, msm_rtb_probe);
+}
+
+static void __exit msm_rtb_exit(void)
+{
+ platform_driver_unregister(&msm_rtb_driver);
+}
+module_init(msm_rtb_init)
+module_exit(msm_rtb_exit)
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index eb4220a132ec..49fa2e6eea98 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,4 +15,3 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
-
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 88bb99b96150..0e6a38dd6583 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,6 +41,7 @@
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/sched/rt.h>
+#include <linux/coresight-stm.h>
#include "trace.h"
#include "trace_output.h"
@@ -573,8 +574,11 @@ int __trace_puts(unsigned long ip, const char *str, int size)
if (entry->buf[size - 1] != '\n') {
entry->buf[size] = '\n';
entry->buf[size + 1] = '\0';
- } else
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 2);
+ } else {
entry->buf[size] = '\0';
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 1);
+ }
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
@@ -615,6 +619,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->str = str;
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->str, strlen(entry->str)+1);
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
@@ -1640,7 +1645,7 @@ static void __trace_find_cmdline(int pid, char comm[])
map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
- strcpy(comm, get_saved_cmdlines(map));
+ strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN - 1);
else
strcpy(comm, "<...>");
}
@@ -2290,6 +2295,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
+ stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, len + 1);
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
@@ -5355,8 +5361,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
entry->buf[cnt + 1] = '\0';
- } else
+ stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 2);
+ } else {
entry->buf[cnt] = '\0';
+ stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 1);
+ }
__buffer_unlock_commit(buffer, event);
diff --git a/kernel/trace/trace_cpu_freq_switch.c b/kernel/trace/trace_cpu_freq_switch.c
new file mode 100644
index 000000000000..f9dab6c4bb72
--- /dev/null
+++ b/kernel/trace/trace_cpu_freq_switch.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2012, 2016 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hrtimer.h>
+#include <linux/tracefs.h>
+#include <linux/ktime.h>
+#include <trace/events/power.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+struct trans {
+ struct rb_node node;
+ unsigned int cpu;
+ unsigned int start_freq;
+ unsigned int end_freq;
+ unsigned int min_us;
+ unsigned int max_us;
+ ktime_t total_t;
+ unsigned int count;
+};
+static struct rb_root freq_trans_tree = RB_ROOT;
+
+static struct trans *tr_search(struct rb_root *root, unsigned int cpu,
+ unsigned int start_freq, unsigned int end_freq)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct trans *tr = container_of(node, struct trans, node);
+
+ if (cpu < tr->cpu)
+ node = node->rb_left;
+ else if (cpu > tr->cpu)
+ node = node->rb_right;
+ else if (start_freq < tr->start_freq)
+ node = node->rb_left;
+ else if (start_freq > tr->start_freq)
+ node = node->rb_right;
+ else if (end_freq < tr->end_freq)
+ node = node->rb_left;
+ else if (end_freq > tr->end_freq)
+ node = node->rb_right;
+ else
+ return tr;
+ }
+ return NULL;
+}
+
+static int tr_insert(struct rb_root *root, struct trans *tr)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ while (*new) {
+ struct trans *this = container_of(*new, struct trans, node);
+
+ parent = *new;
+ if (tr->cpu < this->cpu)
+ new = &((*new)->rb_left);
+ else if (tr->cpu > this->cpu)
+ new = &((*new)->rb_right);
+ else if (tr->start_freq < this->start_freq)
+ new = &((*new)->rb_left);
+ else if (tr->start_freq > this->start_freq)
+ new = &((*new)->rb_right);
+ else if (tr->end_freq < this->end_freq)
+ new = &((*new)->rb_left);
+ else if (tr->end_freq > this->end_freq)
+ new = &((*new)->rb_right);
+ else
+ return -EINVAL;
+ }
+
+ rb_link_node(&tr->node, parent, new);
+ rb_insert_color(&tr->node, root);
+
+ return 0;
+}
+
+struct trans_state {
+ spinlock_t lock;
+ unsigned int start_freq;
+ unsigned int end_freq;
+ ktime_t start_t;
+ bool started;
+};
+static DEFINE_PER_CPU(struct trans_state, freq_trans_state);
+
+static DEFINE_SPINLOCK(state_lock);
+
+static void probe_start(void *ignore, unsigned int start_freq,
+ unsigned int end_freq, unsigned int cpu)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ per_cpu(freq_trans_state, cpu).start_freq = start_freq;
+ per_cpu(freq_trans_state, cpu).end_freq = end_freq;
+ per_cpu(freq_trans_state, cpu).start_t = ktime_get();
+ per_cpu(freq_trans_state, cpu).started = true;
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void probe_end(void *ignore, unsigned int cpu)
+{
+ unsigned long flags;
+ struct trans *tr;
+ s64 dur_us;
+ ktime_t dur_t, end_t = ktime_get();
+
+ spin_lock_irqsave(&state_lock, flags);
+
+ if (!per_cpu(freq_trans_state, cpu).started)
+ goto out;
+
+ dur_t = ktime_sub(end_t, per_cpu(freq_trans_state, cpu).start_t);
+ dur_us = ktime_to_us(dur_t);
+
+ tr = tr_search(&freq_trans_tree, cpu,
+ per_cpu(freq_trans_state, cpu).start_freq,
+ per_cpu(freq_trans_state, cpu).end_freq);
+ if (!tr) {
+ tr = kzalloc(sizeof(*tr), GFP_ATOMIC);
+ if (!tr) {
+ WARN_ONCE(1, "CPU frequency trace is now invalid!\n");
+ goto out;
+ }
+
+ tr->start_freq = per_cpu(freq_trans_state, cpu).start_freq;
+ tr->end_freq = per_cpu(freq_trans_state, cpu).end_freq;
+ tr->cpu = cpu;
+ tr->min_us = UINT_MAX;
+ tr_insert(&freq_trans_tree, tr);
+ }
+ tr->total_t = ktime_add(tr->total_t, dur_t);
+ tr->count++;
+
+ if (dur_us > tr->max_us)
+ tr->max_us = dur_us;
+ if (dur_us < tr->min_us)
+ tr->min_us = dur_us;
+
+ per_cpu(freq_trans_state, cpu).started = false;
+out:
+ spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void *freq_switch_stat_start(struct tracer_stat *trace)
+{
+ struct rb_node *n;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ n = rb_first(&freq_trans_tree);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return n;
+}
+
+static void *freq_switch_stat_next(void *prev, int idx)
+{
+ struct rb_node *n;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ n = rb_next(prev);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return n;
+}
+
+static int freq_switch_stat_show(struct seq_file *s, void *p)
+{
+ unsigned long flags;
+ struct trans *tr = p;
+
+ spin_lock_irqsave(&state_lock, flags);
+ seq_printf(s, "%3d %9d %8d %5d %6lld %6d %6d\n", tr->cpu,
+ tr->start_freq, tr->end_freq, tr->count,
+ div_s64(ktime_to_us(tr->total_t), tr->count),
+ tr->min_us, tr->max_us);
+ spin_unlock_irqrestore(&state_lock, flags);
+
+ return 0;
+}
+
+static void freq_switch_stat_release(void *stat)
+{
+ struct trans *tr = stat;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state_lock, flags);
+ rb_erase(&tr->node, &freq_trans_tree);
+ spin_unlock_irqrestore(&state_lock, flags);
+ kfree(tr);
+}
+
+static int freq_switch_stat_headers(struct seq_file *s)
+{
+ seq_puts(s, "CPU START_KHZ END_KHZ COUNT AVG_US MIN_US MAX_US\n");
+ seq_puts(s, " | | | | | | |\n");
+ return 0;
+}
+
+struct tracer_stat freq_switch_stats __read_mostly = {
+ .name = "cpu_freq_switch",
+ .stat_start = freq_switch_stat_start,
+ .stat_next = freq_switch_stat_next,
+ .stat_show = freq_switch_stat_show,
+ .stat_release = freq_switch_stat_release,
+ .stat_headers = freq_switch_stat_headers
+};
+
+static void trace_freq_switch_disable(void)
+{
+ unregister_stat_tracer(&freq_switch_stats);
+ unregister_trace_cpu_frequency_switch_end(probe_end, NULL);
+ unregister_trace_cpu_frequency_switch_start(probe_start, NULL);
+ pr_info("disabled cpu frequency switch time profiling\n");
+}
+
+static int trace_freq_switch_enable(void)
+{
+ int ret;
+
+ ret = register_trace_cpu_frequency_switch_start(probe_start, NULL);
+ if (ret)
+ goto out;
+
+ ret = register_trace_cpu_frequency_switch_end(probe_end, NULL);
+ if (ret)
+ goto err_register_switch_end;
+
+ ret = register_stat_tracer(&freq_switch_stats);
+ if (ret)
+ goto err_register_stat_tracer;
+
+ pr_info("enabled cpu frequency switch time profiling\n");
+ return 0;
+
+err_register_stat_tracer:
+ unregister_trace_cpu_frequency_switch_end(probe_end, NULL);
+err_register_switch_end:
+	unregister_trace_cpu_frequency_switch_start(probe_start, NULL);
+out:
+ pr_err("failed to enable cpu frequency switch time profiling\n");
+
+ return ret;
+}
+
+static DEFINE_MUTEX(debugfs_lock);
+static bool trace_freq_switch_enabled;
+
+static int debug_toggle_tracing(void *data, u64 val)
+{
+ int ret = 0;
+
+ mutex_lock(&debugfs_lock);
+
+ if (val == 1 && !trace_freq_switch_enabled)
+ ret = trace_freq_switch_enable();
+ else if (val == 0 && trace_freq_switch_enabled)
+ trace_freq_switch_disable();
+ else if (val > 1)
+ ret = -EINVAL;
+
+ if (!ret)
+ trace_freq_switch_enabled = val;
+
+ mutex_unlock(&debugfs_lock);
+
+ return ret;
+}
+
+static int debug_tracing_state_get(void *data, u64 *val)
+{
+ mutex_lock(&debugfs_lock);
+ *val = trace_freq_switch_enabled;
+ mutex_unlock(&debugfs_lock);
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(debug_tracing_state_fops, debug_tracing_state_get,
+ debug_toggle_tracing, "%llu\n");
+
+static int __init trace_freq_switch_init(void)
+{
+ struct dentry *d_tracer = tracing_init_dentry();
+
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ tracefs_create_file("cpu_freq_switch_profile_enabled",
+ S_IRUGO | S_IWUSR, d_tracer, NULL, &debug_tracing_state_fops);
+
+ return 0;
+}
+late_initcall(trace_freq_switch_init);
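The stat tracer added above keeps one rb-tree bucket per (cpu, start_freq, end_freq) transition and accumulates count, total, min and max so that AVG_US can be derived at print time. A minimal userspace sketch of that bookkeeping follows; it is illustrative only and not part of the patch, and all names are made up.

/* Sketch of the per-transition statistics kept by the cpu_freq_switch tracer. */
#include <stdint.h>
#include <stdio.h>

struct switch_stat {
	uint64_t total_us;
	uint32_t count;
	uint32_t min_us;
	uint32_t max_us;
};

static void record_switch(struct switch_stat *st, uint32_t dur_us)
{
	if (st->count == 0 || dur_us < st->min_us)
		st->min_us = dur_us;
	if (dur_us > st->max_us)
		st->max_us = dur_us;
	st->total_us += dur_us;
	st->count++;
}

int main(void)
{
	struct switch_stat st = { 0 };
	uint32_t samples[] = { 120, 340, 95, 210 };	/* fabricated durations in us */

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		record_switch(&st, samples[i]);

	/* AVG_US is derived from total/count, as in freq_switch_stat_show(). */
	printf("count=%u avg=%llu min=%u max=%u\n", st.count,
	       (unsigned long long)(st.total_us / st.count),
	       st.min_us, st.max_us);
	return 0;
}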
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9319be..731f6484b811 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -256,7 +256,8 @@ int perf_trace_add(struct perf_event *p_event, int flags)
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
+ if (!hlist_unhashed(&p_event->hlist_entry))
+ hlist_del_rcu(&p_event->hlist_entry);
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
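The guard added to perf_trace_del() above makes the delete path tolerant of an event whose hlist entry was never added or was already unlinked and re-initialised: hlist_unhashed() reports a node whose pprev is NULL, and only hashed nodes are removed. A small userspace sketch of the same defensive-unlink idea, using a simplified list node rather than the kernel hlist API (illustrative only):

#include <stdbool.h>
#include <stddef.h>

struct node {
	struct node *next;
	struct node **pprev;	/* NULL <=> not on a list, as hlist_unhashed() checks */
};

static bool node_unhashed(const struct node *n)
{
	return n->pprev == NULL;
}

static void node_del(struct node *n)
{
	if (node_unhashed(n))
		return;		/* the guard: deleting a never-added node is a no-op */
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
	n->next = NULL;
	n->pprev = NULL;
}

int main(void)
{
	struct node n = { NULL, NULL };	/* never added to any list */
	node_del(&n);			/* safe no-op thanks to the guard */
	return 0;
}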
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ba5392807912..1235f9fd9fbd 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -287,14 +287,15 @@ static void output_printk(struct trace_event_buffer *fbuffer)
spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer,
+ unsigned long len)
{
if (tracepoint_printk)
output_printk(fbuffer);
event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc);
+ fbuffer->flags, fbuffer->pc, len);
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 21b162c07e83..c00137ea939e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -13,6 +13,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/sched/sysctl.h>
#include "trace.h"
@@ -39,6 +40,12 @@ static int save_flags;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+/*
+ * irqsoff stack tracing threshold in ns.
+ * default: 1ms
+ */
+unsigned int sysctl_irqsoff_tracing_threshold_ns = 1000000UL;
+
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@@ -454,17 +461,52 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+struct irqsoff_store {
+ u64 ts;
+ unsigned long caddr[4];
+};
+
+static DEFINE_PER_CPU(struct irqsoff_store, the_irqsoff);
+#endif /* CONFIG_PREEMPTIRQ_EVENTS */
+
/*
* We are only interested in hardirq on/off events:
*/
static inline void tracer_hardirqs_on(void)
{
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+ struct irqsoff_store *is = &per_cpu(the_irqsoff,
+ raw_smp_processor_id());
+
+ if (!is->ts) {
+ is->ts = sched_clock();
+ is->caddr[0] = CALLER_ADDR0;
+ is->caddr[1] = CALLER_ADDR1;
+ is->caddr[2] = CALLER_ADDR2;
+ is->caddr[3] = CALLER_ADDR3;
+ }
+#endif /* CONFIG_PREEMPTIRQ_EVENTS */
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
static inline void tracer_hardirqs_off(void)
{
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+ struct irqsoff_store *is = &per_cpu(the_irqsoff,
+ raw_smp_processor_id());
+ u64 delta = 0;
+
+ if (is->ts) {
+ delta = sched_clock() - is->ts;
+ is->ts = 0;
+ }
+ if (delta > sysctl_irqsoff_tracing_threshold_ns)
+ trace_irqs_disable(delta, is->caddr[0], is->caddr[1],
+ is->caddr[2], is->caddr[3]);
+#endif /* CONFIG_PREEMPTIRQ_EVENTS */
+
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
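The irqsoff change above records a per-CPU timestamp and caller addresses on one side of the critical window and, on the other side, computes the elapsed time and emits a trace event only when it exceeds sysctl_irqsoff_tracing_threshold_ns (1 ms by default). A minimal userspace sketch of that threshold-gated duration reporting, with assumed names and CLOCK_MONOTONIC standing in for sched_clock() (not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TRACE_THRESHOLD_NS 1000000ULL	/* mirrors the 1 ms default above */

static uint64_t window_ts;		/* 0 means "no window currently open" */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void window_open(void)
{
	if (!window_ts)			/* only the outermost entry starts the clock */
		window_ts = now_ns();
}

static void window_close(void)
{
	uint64_t delta = 0;

	if (window_ts) {
		delta = now_ns() - window_ts;
		window_ts = 0;
	}
	if (delta > TRACE_THRESHOLD_NS)	/* report only long windows, like trace_irqs_disable() */
		fprintf(stderr, "long critical window: %llu ns\n",
			(unsigned long long)delta);
}

int main(void)
{
	window_open();
	/* ... work done while the window is conceptually open ... */
	window_close();
	return 0;
}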
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index ad1d6164e946..e82cff5c842c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -304,7 +304,7 @@ static int t_show(struct seq_file *m, void *v)
if (!*fmt)
return 0;
- seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
+ seq_printf(m, "0x%lx : \"", 0L);
/*
* Tabs and new lines need to be converted.
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9d4399b553a3..78f04e4ad829 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -359,7 +359,8 @@ static bool report_latency(struct trace_array *tr, cycle_t delta)
}
static void
-probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu,
+ unsigned int load)
{
if (task != wakeup_task)
return;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0655afbea83f..a01740a98afa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -336,7 +336,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ irq_flags, pc, 0);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -382,7 +382,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->ret = syscall_get_return_value(current, regs);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
- irq_flags, pc);
+ irq_flags, pc, 0);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 1dc887bab085..31a436f9f13b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -823,7 +823,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0, 0);
}
/* uprobe handler */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b718530034d2..f527d8e65d69 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/device.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -20,11 +21,13 @@
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <linux/tick.h>
+#include <linux/workqueue.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
#include <linux/kthread.h>
+#include <soc/qcom/watchdog.h>
/*
* The run state of the lockup detectors is controlled by the content of the
@@ -94,6 +97,7 @@ static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(unsigned int, watchdog_en);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
@@ -230,7 +234,15 @@ static void __touch_watchdog(void)
__this_cpu_write(watchdog_touch_ts, get_timestamp());
}
-void touch_softlockup_watchdog(void)
+/**
+ * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
+ *
+ * Call when the scheduler may have stalled for legitimate reasons
+ * preventing the watchdog task from executing - e.g. the scheduler
+ * entering idle state. This should only be used for scheduler events.
+ * Use touch_softlockup_watchdog() for everything else.
+ */
+void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
@@ -238,6 +250,12 @@ void touch_softlockup_watchdog(void)
*/
raw_cpu_write(watchdog_touch_ts, 0);
}
+
+void touch_softlockup_watchdog(void)
+{
+ touch_softlockup_watchdog_sched();
+ wq_watchdog_touch(raw_smp_processor_id());
+}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
@@ -251,6 +269,7 @@ void touch_all_softlockup_watchdogs(void)
*/
for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
+ wq_watchdog_touch(-1);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -346,8 +365,11 @@ static void watchdog_check_hardlockup_other_cpu(void)
if (per_cpu(hard_watchdog_warn, next_cpu) == true)
return;
- if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ if (hardlockup_panic) {
+ pr_err("Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+ msm_trigger_wdog_bite();
+ }
else
WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
@@ -409,6 +431,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ if (hardlockup_panic)
+ msm_trigger_wdog_bite();
+
print_modules();
print_irqtrace_events(current);
if (regs)
@@ -531,6 +556,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+
+ if (softlockup_panic)
+ msm_trigger_wdog_bite();
__this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
@@ -567,9 +595,13 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
sched_setscheduler(current, policy, &param);
}
-static void watchdog_enable(unsigned int cpu)
+void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+ if (*enabled)
+ return;
/* kick off the timer for the hardlockup detector */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -585,16 +617,40 @@ static void watchdog_enable(unsigned int cpu)
/* initialize timestamp */
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
__touch_watchdog();
+
+ /*
+ * Need to ensure above operations are observed by other CPUs before
+ * indicating that timer is enabled. This is to synchronize core
+ * isolation and hotplug. Core isolation will wait for this flag to be
+ * set.
+ */
+ mb();
+ *enabled = 1;
}
-static void watchdog_disable(unsigned int cpu)
+void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ unsigned int *enabled = raw_cpu_ptr(&watchdog_en);
+
+ if (!*enabled)
+ return;
watchdog_set_prio(SCHED_NORMAL, 0);
hrtimer_cancel(hrtimer);
/* disable the perf event */
watchdog_nmi_disable(cpu);
+
+ /*
+ * No need for barrier here since disabling the watchdog is
+ * synchronized with hotplug lock
+ */
+ *enabled = 0;
+}
+
+bool watchdog_configured(unsigned int cpu)
+{
+ return *per_cpu_ptr(&watchdog_en, cpu);
}
static void watchdog_cleanup(unsigned int cpu, bool online)
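watchdog_enable() above publishes the per-CPU watchdog_en flag only after the hrtimer and priority setup is complete, using mb() so that a core-isolation or hotplug path polling watchdog_configured() on another CPU never observes the flag before the setup. A rough userspace sketch of the same publish/observe pattern using C11 release/acquire atomics (illustrative only; the kernel code uses a full barrier followed by a plain store):

#include <stdatomic.h>
#include <stdbool.h>

struct per_cpu_wd {
	int setup_state;		/* stands in for the hrtimer/priority setup */
	atomic_bool enabled;
};

static void wd_enable(struct per_cpu_wd *wd)
{
	wd->setup_state = 1;				/* do the real setup first */
	atomic_store_explicit(&wd->enabled, true,
			      memory_order_release);	/* publish, like mb(); *enabled = 1 */
}

static bool wd_configured(struct per_cpu_wd *wd)
{
	/* an observer that sees enabled == true also sees the setup */
	return atomic_load_explicit(&wd->enabled, memory_order_acquire);
}

int main(void)
{
	static struct per_cpu_wd wd;	/* zero-initialised: not yet enabled */

	wd_enable(&wd);
	return wd_configured(&wd) ? 0 : 1;
}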
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d8a2084b88db..89a0f1171f90 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,8 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/bug.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
@@ -149,6 +151,8 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
+ unsigned long watchdog_ts; /* L: watchdog timestamp */
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -1123,6 +1127,8 @@ static void pwq_activate_delayed_work(struct work_struct *work)
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
+ if (list_empty(&pwq->pool->worklist))
+ pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++;
@@ -1281,6 +1287,12 @@ fail:
if (work_is_canceling(work))
return -ENOENT;
cpu_relax();
+ /*
+ * The queueing is in progress in another context. If we keep
+ * taking the pool->lock in a busy loop, the other context may
+ * never get the lock. Give 1 usec delay to avoid this contention.
+ */
+ udelay(1);
return -EAGAIN;
}
@@ -1425,6 +1437,8 @@ retry:
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
+ if (list_empty(worklist))
+ pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
@@ -1496,8 +1510,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
- timer_stats_timer_set_start_info(&dwork->timer);
-
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -2076,6 +2088,7 @@ __acquires(&pool->lock)
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
+ BUG_ON(PANIC_CORRUPTION);
dump_stack();
}
@@ -2191,6 +2204,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
+ pool->watchdog_ts = jiffies;
+
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
@@ -2274,6 +2289,7 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
+ bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2290,9 +2306,14 @@ repeat:
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_pwq(work) == pwq)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ if (first)
+ pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ }
+ first = false;
+ }
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
@@ -3103,6 +3124,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
+ pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -4372,7 +4394,9 @@ void show_workqueue_state(void)
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" workers=%d", pool->nr_workers);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
@@ -5242,6 +5266,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely. Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+ TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+ int cpu;
+
+ wq_watchdog_touched = jiffies;
+ for_each_possible_cpu(cpu)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ bool lockup_detected = false;
+ struct worker_pool *pool;
+ int pi;
+
+ if (!thresh)
+ return;
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ unsigned long pool_ts, touched, ts;
+
+ if (list_empty(&pool->worklist))
+ continue;
+
+ /* get the latest of pool and touched timestamps */
+ pool_ts = READ_ONCE(pool->watchdog_ts);
+ touched = READ_ONCE(wq_watchdog_touched);
+
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+
+ if (pool->cpu >= 0) {
+ unsigned long cpu_touched =
+ READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+ pool->cpu));
+ if (time_after(cpu_touched, ts))
+ ts = cpu_touched;
+ }
+
+ /* did we stall? */
+ if (time_after(jiffies, ts + thresh)) {
+ lockup_detected = true;
+ pr_emerg("BUG: workqueue lockup - pool");
+ pr_cont_pool_info(pool);
+ pr_cont(" stuck for %us!\n",
+ jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (lockup_detected)
+ show_workqueue_state();
+
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+ if (cpu >= 0)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ else
+ wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+ wq_watchdog_thresh = 0;
+ del_timer_sync(&wq_watchdog_timer);
+
+ if (thresh) {
+ wq_watchdog_thresh = thresh;
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+ }
+}
+
+static int wq_watchdog_param_set_thresh(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long thresh;
+ int ret;
+
+ ret = kstrtoul(val, 0, &thresh);
+ if (ret)
+ return ret;
+
+ if (system_wq)
+ wq_watchdog_set_thresh(thresh);
+ else
+ wq_watchdog_thresh = thresh;
+
+ return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+ .set = wq_watchdog_param_set_thresh,
+ .get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+ 0644);
+
+static void wq_watchdog_init(void)
+{
+ wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
@@ -5365,6 +5537,9 @@ static int __init init_workqueues(void)
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+
+ wq_watchdog_init();
+
return 0;
}
early_initcall(init_workqueues);
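The workqueue watchdog introduced above declares a pool stalled when jiffies moves past the newest of the pool's watchdog_ts, the global touch timestamp and the per-CPU touch timestamp by more than the configured threshold. A compact userspace sketch of that comparison, including a wraparound-safe time_after()-style helper; the values and names are made up for illustration and this is not the kernel implementation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* wraparound-safe "a is after b", mirroring the kernel's time_after() */
static bool ts_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

static bool pool_stalled(uint32_t now, uint32_t pool_ts, uint32_t touched,
			 uint32_t cpu_touched, uint32_t thresh)
{
	uint32_t ts = pool_ts;

	/* take the latest of the pool and touch timestamps, as in wq_watchdog_timer_fn() */
	if (ts_after(touched, ts))
		ts = touched;
	if (ts_after(cpu_touched, ts))
		ts = cpu_touched;

	return ts_after(now, ts + thresh);
}

int main(void)
{
	/* e.g. a 30 s threshold at HZ=100 is 3000 ticks; numbers are fabricated */
	printf("%d\n", pool_stalled(10000, 5000, 6000, 6500, 3000)); /* 1: stalled */
	printf("%d\n", pool_stalled(9000, 5000, 6000, 8500, 3000));  /* 0: recently touched */
	return 0;
}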