author    | Linux Build Service Account <lnxbuild@localhost>                  | 2016-08-26 22:22:38 -0700
committer | Gerrit - the friendly Code Review server <code-review@localhost>  | 2016-08-26 22:22:38 -0700
commit    | 1b7819036eaa2ba0c93bcc65fbbc1e72fa9b824a (patch)
tree      | a08a77e7e4a5c5bd332a7a98637516dd65b217d3 /kernel
parent    | 373c6ba99fddd8071aa8eebd889c73426760fa3c (diff)
parent    | 2552980f79e476b99d9f489c265a836dd61a2102 (diff)
Merge "sched: handle frequency alert notifications better"
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/Makefile |    1
-rw-r--r-- | kernel/sched/core.c   | 3394
-rw-r--r-- | kernel/sched/debug.c  |    5
-rw-r--r-- | kernel/sched/fair.c   | 1179
-rw-r--r-- | kernel/sched/hmp.c    | 3970
-rw-r--r-- | kernel/sched/sched.h  |  435
-rw-r--r-- | kernel/sysctl.c       |   20
7 files changed, 4409 insertions(+), 4595 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 846c15156616..1f159743ebfc 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SCHED_HMP) += hmp.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25afcb8a1402..a1626bdf8729 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,9 +74,6 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> -#include <linux/cpufreq.h> -#include <linux/syscore_ops.h> -#include <linux/list_sort.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -100,7 +97,6 @@ const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", "RQ_TO_RQ", "GROUP_TO_GROUP"}; -ATOMIC_NOTIFIER_HEAD(migration_notifier_head); ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); DEFINE_MUTEX(sched_domains_mutex); @@ -775,202 +771,8 @@ void sched_avg_update(struct rq *rq) } } -/* - * Note C-state for (idle) cpus. - * - * @cstate = cstate index, 0 -> active state - * @wakeup_energy = energy spent in waking up cpu - * @wakeup_latency = latency to wakeup from cstate - * - */ -void -sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) -{ - struct rq *rq = cpu_rq(cpu); - - rq->cstate = cstate; /* C1, C2 etc */ - rq->wakeup_energy = wakeup_energy; - rq->wakeup_latency = wakeup_latency; -} - #endif /* CONFIG_SMP */ -#ifdef CONFIG_SCHED_HMP - -static ktime_t ktime_last; -static bool sched_ktime_suspended; - -static bool use_cycle_counter; -static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; - -u64 sched_ktime_clock(void) -{ - if (unlikely(sched_ktime_suspended)) - return ktime_to_ns(ktime_last); - return ktime_get_ns(); -} - -static void sched_resume(void) -{ - sched_ktime_suspended = false; -} - -static int sched_suspend(void) -{ - ktime_last = ktime_get(); - sched_ktime_suspended = true; - return 0; -} - -static struct syscore_ops sched_syscore_ops = { - .resume = sched_resume, - .suspend = sched_suspend -}; - -static int __init sched_init_ops(void) -{ - register_syscore_ops(&sched_syscore_ops); - return 0; -} -late_initcall(sched_init_ops); - -static inline void clear_ed_task(struct task_struct *p, struct rq *rq) -{ - if (p == rq->ed_task) - rq->ed_task = NULL; -} - -static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) -{ - p->last_wake_ts = wallclock; -} - -static inline void set_task_last_switch_out(struct task_struct *p, - u64 wallclock) -{ - p->last_switch_out_ts = wallclock; -} - -/* - * Note D-state for (idle) cluster. 
- * - * @dstate = dstate index, 0 -> active state - * @wakeup_energy = energy spent in waking up cluster - * @wakeup_latency = latency to wakeup from cluster - * - */ -void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, - int wakeup_energy, int wakeup_latency) -{ - struct sched_cluster *cluster = - cpu_rq(cpumask_first(cluster_cpus))->cluster; - cluster->dstate = dstate; - cluster->dstate_wakeup_energy = wakeup_energy; - cluster->dstate_wakeup_latency = wakeup_latency; -} - -u32 __weak get_freq_max_load(int cpu, u32 freq) -{ - /* 100% by default */ - return 100; -} - -DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); -static DEFINE_SPINLOCK(freq_max_load_lock); - -int sched_update_freq_max_load(const cpumask_t *cpumask) -{ - int i, cpu, ret; - unsigned int freq; - struct cpu_pstate_pwr *costs; - struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); - struct freq_max_load *max_load, *old_max_load; - struct freq_max_load_entry *entry; - u64 max_demand_capacity, max_demand; - unsigned long flags; - u32 hfreq; - int hpct; - - if (!per_cpu_info) - return 0; - - spin_lock_irqsave(&freq_max_load_lock, flags); - max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); - for_each_cpu(cpu, cpumask) { - if (!per_cpu_info[cpu].ptable) { - ret = -EINVAL; - goto fail; - } - - old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); - - /* - * allocate len + 1 and leave the last power cost as 0 for - * power_cost() can stop iterating index when - * per_cpu_info[cpu].len > len of max_load due to race between - * cpu power stats update and get_cpu_pwr_stats(). - */ - max_load = kzalloc(sizeof(struct freq_max_load) + - sizeof(struct freq_max_load_entry) * - (per_cpu_info[cpu].len + 1), GFP_ATOMIC); - if (unlikely(!max_load)) { - ret = -ENOMEM; - goto fail; - } - - max_load->length = per_cpu_info[cpu].len; - - max_demand = max_demand_capacity * - cpu_max_possible_capacity(cpu); - - i = 0; - costs = per_cpu_info[cpu].ptable; - while (costs[i].freq) { - entry = &max_load->freqs[i]; - freq = costs[i].freq; - hpct = get_freq_max_load(cpu, freq); - if (hpct <= 0 && hpct > 100) - hpct = 100; - hfreq = div64_u64((u64)freq * hpct, 100); - entry->hdemand = - div64_u64(max_demand * hfreq, - cpu_max_possible_freq(cpu)); - i++; - } - - rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); - if (old_max_load) - kfree_rcu(old_max_load, rcu); - } - - spin_unlock_irqrestore(&freq_max_load_lock, flags); - return 0; - -fail: - for_each_cpu(cpu, cpumask) { - max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); - if (max_load) { - rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); - kfree_rcu(max_load, rcu); - } - } - - spin_unlock_irqrestore(&freq_max_load_lock, flags); - return ret; -} - -#else /* CONFIG_SCHED_HMP */ -u64 sched_ktime_clock(void) -{ - return 0; -} - -static inline void clear_ed_task(struct task_struct *p, struct rq *rq) {} -static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) {} -static inline void set_task_last_switch_out(struct task_struct *p, - u64 wallclock) {} -#endif /* CONFIG_SCHED_HMP */ - #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) /* @@ -1259,3004 +1061,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq, true); } -#ifdef CONFIG_SCHED_HMP -unsigned int max_possible_efficiency = 1; -unsigned int min_possible_efficiency = UINT_MAX; - -unsigned long __weak 
arch_get_cpu_efficiency(int cpu) -{ - return SCHED_LOAD_SCALE; -} - -/* Keep track of max/min capacity possible across CPUs "currently" */ -static void __update_min_max_capacity(void) -{ - int i; - int max_cap = 0, min_cap = INT_MAX; - - for_each_online_cpu(i) { - max_cap = max(max_cap, cpu_capacity(i)); - min_cap = min(min_cap, cpu_capacity(i)); - } - - max_capacity = max_cap; - min_capacity = min_cap; -} - -static void update_min_max_capacity(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - __update_min_max_capacity(); - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - local_irq_restore(flags); -} - -/* - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that - * least efficient cpu gets capacity of 1024 - */ -static unsigned long -capacity_scale_cpu_efficiency(struct sched_cluster *cluster) -{ - return (1024 * cluster->efficiency) / min_possible_efficiency; -} - -/* - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. - */ -static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) -{ - return (1024 * cluster_max_freq(cluster)) / min_max_freq; -} - -/* - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so - * that "most" efficient cpu gets a load_scale_factor of 1 - */ -static inline unsigned long -load_scale_cpu_efficiency(struct sched_cluster *cluster) -{ - return DIV_ROUND_UP(1024 * max_possible_efficiency, - cluster->efficiency); -} - -/* - * Return load_scale_factor of a cpu in reference to cpu with best max_freq - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor - * of 1. - */ -static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) -{ - return DIV_ROUND_UP(1024 * max_possible_freq, - cluster_max_freq(cluster)); -} - -static int compute_capacity(struct sched_cluster *cluster) -{ - int capacity = 1024; - - capacity *= capacity_scale_cpu_efficiency(cluster); - capacity >>= 10; - - capacity *= capacity_scale_cpu_freq(cluster); - capacity >>= 10; - - return capacity; -} - -static int compute_max_possible_capacity(struct sched_cluster *cluster) -{ - int capacity = 1024; - - capacity *= capacity_scale_cpu_efficiency(cluster); - capacity >>= 10; - - capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; - capacity >>= 10; - - return capacity; -} - -static int compute_load_scale_factor(struct sched_cluster *cluster) -{ - int load_scale = 1024; - - /* - * load_scale_factor accounts for the fact that task load - * is in reference to "best" performing cpu. Task's load will need to be - * scaled (up) by a factor to determine suitability to be placed on a - * (little) cpu. 
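
Not from this patch: a standalone user-space sketch of the compute_capacity()/compute_load_scale_factor() arithmetic removed above, with made-up efficiency and frequency numbers. In the kernel the reference values (min_possible_efficiency, min_max_freq, max_possible_freq) are derived across all clusters; here they are fixed constants for illustration.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* scale x against a reference so that the reference maps to 1024 */
static unsigned long scale_up(unsigned long x, unsigned long ref)
{
	return (1024UL * x) / ref;
}

int main(void)
{
	/* made-up numbers: the little cluster is the least efficient and slowest */
	unsigned long min_eff = 1024, max_eff = 1638;		/* efficiencies */
	unsigned long min_max_freq = 1500000;			/* kHz */
	unsigned long max_possible_freq = 2100000;		/* kHz */
	unsigned long big_eff = 1638, big_max_freq = 2100000;

	/* compute_capacity(): 1024 scaled up by relative efficiency, then by max_freq */
	unsigned long capacity = 1024;
	capacity = (capacity * scale_up(big_eff, min_eff)) >> 10;
	capacity = (capacity * scale_up(big_max_freq, min_max_freq)) >> 10;

	/* compute_load_scale_factor(): the inverse, relative to the best cpu */
	unsigned long lsf = 1024;
	lsf = (lsf * DIV_ROUND_UP(1024UL * max_eff, big_eff)) >> 10;
	lsf = (lsf * DIV_ROUND_UP(1024UL * max_possible_freq, big_max_freq)) >> 10;

	printf("big cluster: capacity=%lu load_scale_factor=%lu\n", capacity, lsf);
	return 0;
}
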
- */ - load_scale *= load_scale_cpu_efficiency(cluster); - load_scale >>= 10; - - load_scale *= load_scale_cpu_freq(cluster); - load_scale >>= 10; - - return load_scale; -} - -struct list_head cluster_head; -static DEFINE_MUTEX(cluster_lock); -static cpumask_t all_cluster_cpus = CPU_MASK_NONE; -DECLARE_BITMAP(all_cluster_ids, NR_CPUS); -struct sched_cluster *sched_cluster[NR_CPUS]; -int num_clusters; - -static struct sched_cluster init_cluster = { - .list = LIST_HEAD_INIT(init_cluster.list), - .id = 0, - .max_power_cost = 1, - .min_power_cost = 1, - .capacity = 1024, - .max_possible_capacity = 1024, - .efficiency = 1, - .load_scale_factor = 1024, - .cur_freq = 1, - .max_freq = 1, - .max_mitigated_freq = UINT_MAX, - .min_freq = 1, - .max_possible_freq = 1, - .dstate = 0, - .dstate_wakeup_energy = 0, - .dstate_wakeup_latency = 0, - .exec_scale_factor = 1024, -}; - -void update_all_clusters_stats(void) -{ - struct sched_cluster *cluster; - u64 highest_mpc = 0, lowest_mpc = U64_MAX; - - pre_big_task_count_change(cpu_possible_mask); - - for_each_sched_cluster(cluster) { - u64 mpc; - - cluster->capacity = compute_capacity(cluster); - mpc = cluster->max_possible_capacity = - compute_max_possible_capacity(cluster); - cluster->load_scale_factor = compute_load_scale_factor(cluster); - - cluster->exec_scale_factor = - DIV_ROUND_UP(cluster->efficiency * 1024, - max_possible_efficiency); - - if (mpc > highest_mpc) - highest_mpc = mpc; - - if (mpc < lowest_mpc) - lowest_mpc = mpc; - } - - max_possible_capacity = highest_mpc; - min_max_possible_capacity = lowest_mpc; - - __update_min_max_capacity(); - sched_update_freq_max_load(cpu_possible_mask); - post_big_task_count_change(cpu_possible_mask); -} - -static void assign_cluster_ids(struct list_head *head) -{ - struct sched_cluster *cluster; - int pos = 0; - - list_for_each_entry(cluster, head, list) { - cluster->id = pos; - sched_cluster[pos++] = cluster; - } -} - -static void -move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) -{ - struct list_head *first, *last; - - first = src->next; - last = src->prev; - - if (sync_rcu) { - INIT_LIST_HEAD_RCU(src); - synchronize_rcu(); - } - - first->prev = dst; - dst->prev = last; - last->next = dst; - - /* Ensure list sanity before making the head visible to all CPUs. */ - smp_mb(); - dst->next = first; -} - -static int -compare_clusters(void *priv, struct list_head *a, struct list_head *b) -{ - struct sched_cluster *cluster1, *cluster2; - int ret; - - cluster1 = container_of(a, struct sched_cluster, list); - cluster2 = container_of(b, struct sched_cluster, list); - - ret = cluster1->max_power_cost > cluster2->max_power_cost || - (cluster1->max_power_cost == cluster2->max_power_cost && - cluster1->max_possible_capacity < - cluster2->max_possible_capacity); - - return ret; -} - -static void sort_clusters(void) -{ - struct sched_cluster *cluster; - struct list_head new_head; - - INIT_LIST_HEAD(&new_head); - - for_each_sched_cluster(cluster) { - cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), - max_task_load()); - cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), - 0); - } - - move_list(&new_head, &cluster_head, true); - - list_sort(NULL, &new_head, compare_clusters); - assign_cluster_ids(&new_head); - - /* - * Ensure cluster ids are visible to all CPUs before making - * cluster_head visible. 
- */ - move_list(&cluster_head, &new_head, false); -} - -static void -insert_cluster(struct sched_cluster *cluster, struct list_head *head) -{ - struct sched_cluster *tmp; - struct list_head *iter = head; - - list_for_each_entry(tmp, head, list) { - if (cluster->max_power_cost < tmp->max_power_cost) - break; - iter = &tmp->list; - } - - list_add(&cluster->list, iter); -} - -static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) -{ - struct sched_cluster *cluster = NULL; - - cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); - if (!cluster) { - __WARN_printf("Cluster allocation failed. \ - Possible bad scheduling\n"); - return NULL; - } - - INIT_LIST_HEAD(&cluster->list); - cluster->max_power_cost = 1; - cluster->min_power_cost = 1; - cluster->capacity = 1024; - cluster->max_possible_capacity = 1024; - cluster->efficiency = 1; - cluster->load_scale_factor = 1024; - cluster->cur_freq = 1; - cluster->max_freq = 1; - cluster->max_mitigated_freq = UINT_MAX; - cluster->min_freq = 1; - cluster->max_possible_freq = 1; - cluster->dstate = 0; - cluster->dstate_wakeup_energy = 0; - cluster->dstate_wakeup_latency = 0; - cluster->freq_init_done = false; - - cluster->cpus = *cpus; - cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); - - if (cluster->efficiency > max_possible_efficiency) - max_possible_efficiency = cluster->efficiency; - if (cluster->efficiency < min_possible_efficiency) - min_possible_efficiency = cluster->efficiency; - - return cluster; -} - -static void add_cluster(const struct cpumask *cpus, struct list_head *head) -{ - struct sched_cluster *cluster = alloc_new_cluster(cpus); - int i; - - if (!cluster) - return; - - for_each_cpu(i, cpus) - cpu_rq(i)->cluster = cluster; - - insert_cluster(cluster, head); - set_bit(num_clusters, all_cluster_ids); - num_clusters++; -} - -#ifdef CONFIG_SMP -static void update_cluster_topology(void) -{ - struct cpumask cpus = *cpu_possible_mask; - const struct cpumask *cluster_cpus; - struct list_head new_head; - int i; - - INIT_LIST_HEAD(&new_head); - - for_each_cpu(i, &cpus) { - cluster_cpus = cpu_coregroup_mask(i); - cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); - cpumask_andnot(&cpus, &cpus, cluster_cpus); - add_cluster(cluster_cpus, &new_head); - } - - assign_cluster_ids(&new_head); - - /* - * Ensure cluster ids are visible to all CPUs before making - * cluster_head visible. 
- */ - move_list(&cluster_head, &new_head, false); -} -#endif - -static void init_clusters(void) -{ - bitmap_clear(all_cluster_ids, 0, NR_CPUS); - init_cluster.cpus = *cpu_possible_mask; - INIT_LIST_HEAD(&cluster_head); -} - -int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) -{ - mutex_lock(&cluster_lock); - if (!cb->get_cpu_cycle_counter) { - mutex_unlock(&cluster_lock); - return -EINVAL; - } - - cpu_cycle_counter_cb = *cb; - use_cycle_counter = true; - mutex_unlock(&cluster_lock); - - return 0; -} - -static int __init set_sched_enable_hmp(char *str) -{ - int enable_hmp = 0; - - get_option(&str, &enable_hmp); - - sched_enable_hmp = !!enable_hmp; - - return 0; -} - -early_param("sched_enable_hmp", set_sched_enable_hmp); - -static inline int got_boost_kick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - return test_bit(BOOST_KICK, &rq->hmp_flags); -} - -static inline void clear_boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - clear_bit(BOOST_KICK, &rq->hmp_flags); -} - -void boost_kick(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) - smp_send_reschedule(cpu); -} - -/* Clear any HMP scheduler related requests pending from or on cpu */ -static inline void clear_hmp_request(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - clear_boost_kick(cpu); - clear_reserved(cpu); - if (rq->push_task) { - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->push_task) { - clear_reserved(rq->push_cpu); - put_task_struct(rq->push_task); - rq->push_task = NULL; - } - rq->active_balance = 0; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} - -int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) -{ - struct rq *rq = cpu_rq(cpu); - - rq->static_cpu_pwr_cost = cost; - return 0; -} - -unsigned int sched_get_static_cpu_pwr_cost(int cpu) -{ - return cpu_rq(cpu)->static_cpu_pwr_cost; -} - -int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) -{ - struct sched_cluster *cluster = cpu_rq(cpu)->cluster; - - cluster->static_cluster_pwr_cost = cost; - return 0; -} - -unsigned int sched_get_static_cluster_pwr_cost(int cpu) -{ - return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; -} - -#else /* CONFIG_SCHED_HMP */ - -static inline int got_boost_kick(void) -{ - return 0; -} - -static inline void clear_boost_kick(int cpu) { } - -static inline void clear_hmp_request(int cpu) { } - -int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) -{ - return 0; -} - -#ifdef CONFIG_SMP -static void update_cluster_topology(void) { } -#endif - -#endif /* CONFIG_SCHED_HMP */ - -#define SCHED_MIN_FREQ 1 - -#if defined(CONFIG_SCHED_HMP) - -/* - * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy - * associated with them. This is required for atomic update of those variables - * when being modifed via sysctl interface. - * - * IMPORTANT: Initialize both copies to same value!! - */ - -/* - * Tasks that are runnable continuously for a period greather than - * EARLY_DETECTION_DURATION can be flagged early as potential - * high load tasks. 
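
Not from this patch: a user-space analogue of the boost_kick() coalescing shown above. The kernel sets the BOOST_KICK bit with test_and_set_bit() and sends a reschedule IPI only when the bit was previously clear, so repeated boost requests collapse into a single kick; the sketch uses a C11 atomic_flag and a printf stand-in for the IPI.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag boost_pending = ATOMIC_FLAG_INIT;

/* stand-in for smp_send_reschedule(cpu) */
static void send_kick(int cpu)
{
	printf("kick cpu %d\n", cpu);
}

static void boost_kick(int cpu)
{
	/* only the first caller since the last clear actually sends a kick */
	if (!atomic_flag_test_and_set(&boost_pending))
		send_kick(cpu);
}

static void clear_boost_kick(void)
{
	atomic_flag_clear(&boost_pending);
}

int main(void)
{
	boost_kick(2);		/* sends the kick */
	boost_kick(2);		/* coalesced, nothing sent */
	clear_boost_kick();	/* target cpu acknowledges */
	boost_kick(2);		/* sends again */
	return 0;
}
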
- */ -#define EARLY_DETECTION_DURATION 9500000 - -static __read_mostly unsigned int sched_ravg_hist_size = 5; -__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; - -static __read_mostly unsigned int sched_window_stats_policy = - WINDOW_STATS_MAX_RECENT_AVG; -__read_mostly unsigned int sysctl_sched_window_stats_policy = - WINDOW_STATS_MAX_RECENT_AVG; - -#define SCHED_ACCOUNT_WAIT_TIME 1 - -__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); - -unsigned int __read_mostly sysctl_sched_enable_colocation = 1; - -#ifdef CONFIG_SCHED_FREQ_INPUT - -__read_mostly unsigned int sysctl_sched_new_task_windows = 5; - -#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 - -/* - * For increase, send notification if - * freq_required - cur_freq > sysctl_sched_freq_inc_notify - */ -__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ - -/* - * For decrease, send notification if - * cur_freq - freq_required > sysctl_sched_freq_dec_notify - */ -__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ - -static __read_mostly unsigned int sched_io_is_busy; - -__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ -unsigned int __read_mostly sched_use_pelt; - -/* - * Maximum possible frequency across all cpus. Task demand and cpu - * capacity (cpu_power) metrics are scaled in reference to it. - */ -unsigned int max_possible_freq = 1; - -/* - * Minimum possible max_freq across all cpus. This will be same as - * max_possible_freq on homogeneous systems and could be different from - * max_possible_freq on heterogenous systems. min_max_freq is used to derive - * capacity (cpu_power) of cpus. - */ -unsigned int min_max_freq = 1; - -unsigned int max_capacity = 1024; /* max(rq->capacity) */ -unsigned int min_capacity = 1024; /* min(rq->capacity) */ -unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ -unsigned int -min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ - -/* Window size (in ns) */ -__read_mostly unsigned int sched_ravg_window = 10000000; - -/* Min window size (in ns) = 10ms */ -#define MIN_SCHED_RAVG_WINDOW 10000000 - -/* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 - -/* Temporarily disable window-stats activity on all cpus */ -unsigned int __read_mostly sched_disable_window_stats; - -/* - * Major task runtime. If a task runs for more than sched_major_task_runtime - * in a window, it's considered to be generating majority of workload - * for this window. Prediction could be adjusted for such tasks. - */ -#ifdef CONFIG_SCHED_FREQ_INPUT -__read_mostly unsigned int sched_major_task_runtime = 10000000; -#endif - -static unsigned int sync_cpu; - -static LIST_HEAD(related_thread_groups); -static DEFINE_RWLOCK(related_thread_group_lock); - -#define for_each_related_thread_group(grp) \ - list_for_each_entry(grp, &related_thread_groups, list) - -/* - * Demand aggregation for frequency purpose: - * - * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads - * for frequency determination purpose. This aggregation is done per-cluster. - * - * CPU demand of tasks from various related groups is aggregated per-cluster and - * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined - * by just rq->prev_runnable_sum. 
- * - * Some examples follow, which assume: - * Cluster0 = CPU0-3, Cluster1 = CPU4-7 - * One related thread group A that has tasks A0, A1, A2 - * - * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of - * tasks belonging to group A are accumulated when they run on cpu X. - * - * CX->curr/prev_sum = counters in which cpu execution stats of all tasks - * not belonging to group A are accumulated when they run on cpu X - * - * Lets say the stats for window M was as below: - * - * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms - * Task A0 ran 5ms on CPU0 - * Task B0 ran 1ms on CPU0 - * - * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms - * Task A1 ran 4ms on CPU1 - * Task A2 ran 2ms on CPU1 - * Task B1 ran 5ms on CPU1 - * - * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 - * CPU2 idle - * - * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 - * CPU3 idle - * - * In this case, CPU1 was most busy going by just its prev_sum counter. Demand - * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy - * time reported to governor will be: - * - * - * C0 busy time = 1ms - * C1 busy time = 5 + 5 + 6 = 16ms - * - */ -static __read_mostly unsigned int sched_freq_aggregate; -__read_mostly unsigned int sysctl_sched_freq_aggregate; - -#define EXITING_TASK_MARKER 0xdeaddead - -static inline int exiting_task(struct task_struct *p) -{ - return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); -} - -static int __init set_sched_ravg_window(char *str) -{ - get_option(&str, &sched_ravg_window); - - sched_use_pelt = (sched_ravg_window < MIN_SCHED_RAVG_WINDOW || - sched_ravg_window > MAX_SCHED_RAVG_WINDOW); - - return 0; -} - -early_param("sched_ravg_window", set_sched_ravg_window); - -static inline void -update_window_start(struct rq *rq, u64 wallclock) -{ - s64 delta; - int nr_windows; - - delta = wallclock - rq->window_start; - BUG_ON(delta < 0); - if (delta < sched_ravg_window) - return; - - nr_windows = div64_u64(delta, sched_ravg_window); - rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; -} - -#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) - -static inline u64 scale_exec_time(u64 delta, struct rq *rq) -{ - u32 freq; - - freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); - delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); - delta *= rq->cluster->exec_scale_factor; - delta >>= 10; - - return delta; -} - -#ifdef CONFIG_SCHED_FREQ_INPUT - -static inline int cpu_is_waiting_on_io(struct rq *rq) -{ - if (!sched_io_is_busy) - return 0; - - return atomic_read(&rq->nr_iowait); -} - -/* Does freq_required sufficiently exceed or fall behind cur_freq? */ -static inline int -nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) -{ - int delta = freq_required - cur_freq; - - if (freq_required > cur_freq) - return delta < sysctl_sched_freq_inc_notify; - - delta = -delta; - - return delta < sysctl_sched_freq_dec_notify; -} - -/* Convert busy time to frequency equivalent */ -static inline unsigned int load_to_freq(struct rq *rq, u64 load) -{ - unsigned int freq; - - load = scale_load_to_cpu(load, cpu_of(rq)); - load *= 128; - load = div64_u64(load, max_task_load()); - - freq = load * cpu_max_possible_freq(cpu_of(rq)); - freq /= 128; - - return freq; -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu); - -/* - * Return load from all related group in given cpu. - * Caller must ensure that related_thread_group_lock is held. 
- */ -static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) -{ - struct related_thread_group *grp; - - for_each_related_thread_group(grp) { - struct group_cpu_time *cpu_time; - - cpu_time = _group_cpu_time(grp, cpu); - *grp_load += cpu_time->prev_runnable_sum; - if (new_grp_load) - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } -} - -/* - * Return load from all related groups in given frequency domain. - * Caller must ensure that related_thread_group_lock is held. - */ -static void group_load_in_freq_domain(struct cpumask *cpus, - u64 *grp_load, u64 *new_grp_load) -{ - struct related_thread_group *grp; - int j; - - for_each_related_thread_group(grp) { - for_each_cpu(j, cpus) { - struct group_cpu_time *cpu_time; - - cpu_time = _group_cpu_time(grp, j); - *grp_load += cpu_time->prev_runnable_sum; - *new_grp_load += cpu_time->nt_prev_runnable_sum; - } - } -} - -/* - * Should scheduler alert governor for changing frequency? - * - * @check_pred - evaluate frequency based on the predictive demand - * @check_groups - add load from all related groups on given cpu - * - * check_groups is set to 1 if a "related" task movement/wakeup is triggering - * the notification check. To avoid "re-aggregation" of demand in such cases, - * we check whether the migrated/woken tasks demand (along with demand from - * existing tasks on the cpu) can be met on target cpu - * - */ - -static int send_notification(struct rq *rq, int check_pred, int check_groups) -{ - unsigned int cur_freq, freq_required; - unsigned long flags; - int rc = 0; - u64 group_load = 0, new_load = 0; - - if (!sched_enable_hmp) - return 0; - - if (check_pred) { - u64 prev = rq->old_busy_time; - u64 predicted = rq->hmp_stats.pred_demands_sum; - - if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) - return 0; - - prev = max(prev, rq->old_estimated_time); - if (prev > predicted) - return 0; - - cur_freq = load_to_freq(rq, prev); - freq_required = load_to_freq(rq, predicted); - - if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) - return 0; - } else { - read_lock(&related_thread_group_lock); - /* - * Protect from concurrent update of rq->prev_runnable_sum and - * group cpu load - */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (check_groups) - _group_load_in_cpu(cpu_of(rq), &group_load, NULL); - - new_load = rq->prev_runnable_sum + group_load; - - raw_spin_unlock_irqrestore(&rq->lock, flags); - read_unlock(&related_thread_group_lock); - - cur_freq = load_to_freq(rq, rq->old_busy_time); - freq_required = load_to_freq(rq, new_load); - - if (nearly_same_freq(cur_freq, freq_required)) - return 0; - } - - raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq->notifier_sent) { - rq->notifier_sent = 1; - rc = 1; - trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, - new_load); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return rc; -} - -/* Alert governor if there is a need to change frequency */ -void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) -{ - int cpu = cpu_of(rq); - - if (!send_notification(rq, check_pred, check_groups)) - return; - - atomic_notifier_call_chain( - &load_alert_notifier_head, 0, - (void *)(long)cpu); -} - -static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, - u64 irqtime, int event) -{ - if (is_idle_task(p)) { - /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! 
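
Not from this patch: a trimmed user-space mirror of the nearly_same_freq() check used by send_notification() above. The thresholds are in kHz and default to 10 * 1024 * 1024 (roughly 10 GHz, per the removed comment), so the delta check is effectively never satisfied until the sysctl is lowered; the 300 MHz value below is purely illustrative.

#include <stdio.h>

/* stand-ins for sysctl_sched_freq_inc_notify / sysctl_sched_freq_dec_notify (kHz) */
static int freq_inc_notify = 10 * 1024 * 1024;
static int freq_dec_notify = 10 * 1024 * 1024;

/* returns 1 when the governor does NOT need to be alerted */
static int nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
{
	int delta = freq_required - cur_freq;

	if (freq_required > cur_freq)
		return delta < freq_inc_notify;

	return -delta < freq_dec_notify;
}

int main(void)
{
	/* with the default ~10 GHz threshold a 1.2 GHz jump does not trigger */
	printf("%d\n", nearly_same_freq(600000, 1800000));

	/* with a 300 MHz threshold the same jump would alert the governor */
	freq_inc_notify = 300000;
	printf("%d\n", nearly_same_freq(600000, 1800000));
	return 0;
}
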
*/ - if (event == PICK_NEXT_TASK) - return 0; - - /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ - return irqtime || cpu_is_waiting_on_io(rq); - } - - if (event == TASK_WAKE) - return 0; - - if (event == PUT_PREV_TASK || event == IRQ_UPDATE) - return 1; - - /* - * TASK_UPDATE can be called on sleeping task, when its moved between - * related groups - */ - if (event == TASK_UPDATE) { - if (rq->curr == p) - return 1; - - return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; - } - - /* TASK_MIGRATE, PICK_NEXT_TASK left */ - return SCHED_FREQ_ACCOUNT_WAIT_TIME; -} - -static inline bool is_new_task(struct task_struct *p) -{ - return p->ravg.active_windows < sysctl_sched_new_task_windows; -} - -#define INC_STEP 8 -#define DEC_STEP 2 -#define CONSISTENT_THRES 16 -#define INC_STEP_BIG 16 -/* - * bucket_increase - update the count of all buckets - * - * @buckets: array of buckets tracking busy time of a task - * @idx: the index of bucket to be incremented - * - * Each time a complete window finishes, count of bucket that runtime - * falls in (@idx) is incremented. Counts of all other buckets are - * decayed. The rate of increase and decay could be different based - * on current count in the bucket. - */ -static inline void bucket_increase(u8 *buckets, int idx) -{ - int i, step; - - for (i = 0; i < NUM_BUSY_BUCKETS; i++) { - if (idx != i) { - if (buckets[i] > DEC_STEP) - buckets[i] -= DEC_STEP; - else - buckets[i] = 0; - } else { - step = buckets[i] >= CONSISTENT_THRES ? - INC_STEP_BIG : INC_STEP; - if (buckets[i] > U8_MAX - step) - buckets[i] = U8_MAX; - else - buckets[i] += step; - } - } -} - -static inline int busy_to_bucket(u32 normalized_rt) -{ - int bidx; - - bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); - bidx = min(bidx, NUM_BUSY_BUCKETS - 1); - - /* - * Combine lowest two buckets. The lowest frequency falls into - * 2nd bucket and thus keep predicting lowest bucket is not - * useful. - */ - if (!bidx) - bidx++; - - return bidx; -} - -static inline u64 -scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) -{ - return div64_u64(load * (u64)src_freq, (u64)dst_freq); -} - -#define HEAVY_TASK_SKIP 2 -#define HEAVY_TASK_SKIP_LIMIT 4 -/* - * get_pred_busy - calculate predicted demand for a task on runqueue - * - * @rq: runqueue of task p - * @p: task whose prediction is being updated - * @start: starting bucket. returned prediction should not be lower than - * this bucket. - * @runtime: runtime of the task. returned prediction should not be lower - * than this runtime. - * Note: @start can be derived from @runtime. It's passed in only to - * avoid duplicated calculation in some cases. - * - * A new predicted busy time is returned for task @p based on @runtime - * passed in. The function searches through buckets that represent busy - * time equal to or bigger than @runtime and attempts to find the bucket to - * to use for prediction. Once found, it searches through historical busy - * time and returns the latest that falls into the bucket. If no such busy - * time exists, it returns the medium of that bucket. 
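
Not from this patch: a runnable mirror of bucket_increase() above, showing how the bucket a task's runtime falls into ramps up (faster once its count crosses CONSISTENT_THRES) while every other bucket decays. NUM_BUSY_BUCKETS is assumed to be 10 here.

#include <stdio.h>
#include <stdint.h>

#define NUM_BUSY_BUCKETS	10
#define INC_STEP		8
#define DEC_STEP		2
#define CONSISTENT_THRES	16
#define INC_STEP_BIG		16

/* bump the hit bucket, decay the rest */
static void bucket_increase(uint8_t *buckets, int idx)
{
	int i, step;

	for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
		if (idx != i) {
			buckets[i] = buckets[i] > DEC_STEP ? buckets[i] - DEC_STEP : 0;
		} else {
			step = buckets[i] >= CONSISTENT_THRES ? INC_STEP_BIG : INC_STEP;
			buckets[i] = buckets[i] > UINT8_MAX - step ? UINT8_MAX
								   : buckets[i] + step;
		}
	}
}

int main(void)
{
	uint8_t buckets[NUM_BUSY_BUCKETS] = { 0 };
	int w;

	/* a task that keeps landing in bucket 7 ramps that bucket up quickly */
	for (w = 0; w < 4; w++)
		bucket_increase(buckets, 7);
	printf("bucket 7 after 4 windows: %u\n", buckets[7]);
	return 0;
}
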
- */ -static u32 get_pred_busy(struct rq *rq, struct task_struct *p, - int start, u32 runtime) -{ - int i; - u8 *buckets = p->ravg.busy_buckets; - u32 *hist = p->ravg.sum_history; - u32 dmin, dmax; - u64 cur_freq_runtime = 0; - int first = NUM_BUSY_BUCKETS, final, skip_to; - u32 ret = runtime; - - /* skip prediction for new tasks due to lack of history */ - if (unlikely(is_new_task(p))) - goto out; - - /* find minimal bucket index to pick */ - for (i = start; i < NUM_BUSY_BUCKETS; i++) { - if (buckets[i]) { - first = i; - break; - } - } - /* if no higher buckets are filled, predict runtime */ - if (first >= NUM_BUSY_BUCKETS) - goto out; - - /* compute the bucket for prediction */ - final = first; - if (first < HEAVY_TASK_SKIP_LIMIT) { - /* compute runtime at current CPU frequency */ - cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, - rq->cluster->efficiency); - cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, - max_possible_freq, rq->cluster->cur_freq); - /* - * if the task runs for majority of the window, try to - * pick higher buckets. - */ - if (cur_freq_runtime >= sched_major_task_runtime) { - int next = NUM_BUSY_BUCKETS; - /* - * if there is a higher bucket that's consistently - * hit, don't jump beyond that. - */ - for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && - i < NUM_BUSY_BUCKETS; i++) { - if (buckets[i] > CONSISTENT_THRES) { - next = i; - break; - } - } - skip_to = min(next, start + HEAVY_TASK_SKIP); - /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ - skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); - /* don't go below first non-empty bucket, if any */ - final = max(first, skip_to); - } - } - - /* determine demand range for the predicted bucket */ - if (final < 2) { - /* lowest two buckets are combined */ - dmin = 0; - final = 1; - } else { - dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); - } - dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); - - /* - * search through runtime history and return first runtime that falls - * into the range of predicted bucket. - */ - for (i = 0; i < sched_ravg_hist_size; i++) { - if (hist[i] >= dmin && hist[i] < dmax) { - ret = hist[i]; - break; - } - } - /* no historical runtime within bucket found, use average of the bin */ - if (ret < dmin) - ret = (dmin + dmax) / 2; - /* - * when updating in middle of a window, runtime could be higher - * than all recorded history. Always predict at least runtime. - */ - ret = max(runtime, ret); -out: - trace_sched_update_pred_demand(rq, p, runtime, - mult_frac((unsigned int)cur_freq_runtime, 100, - sched_ravg_window), ret); - return ret; -} - -static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) -{ - if (p->ravg.pred_demand >= p->ravg.curr_window) - return p->ravg.pred_demand; - - return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), - p->ravg.curr_window); -} - -/* - * predictive demand of a task is calculated at the window roll-over. - * if the task current window busy time exceeds the predicted - * demand, update it here to reflect the task needs. 
- */ -void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) -{ - u32 new, old; - - if (is_idle_task(p) || exiting_task(p)) - return; - - if (event != PUT_PREV_TASK && event != TASK_UPDATE && - (!SCHED_FREQ_ACCOUNT_WAIT_TIME || - (event != TASK_MIGRATE && - event != PICK_NEXT_TASK))) - return; - - /* - * TASK_UPDATE can be called on sleeping task, when its moved between - * related groups - */ - if (event == TASK_UPDATE) { - if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) - return; - } - - new = calc_pred_demand(rq, p); - old = p->ravg.pred_demand; - - if (old >= new) - return; - - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - p->sched_class->fixup_hmp_sched_stats(rq, p, - p->ravg.demand, - new); - - p->ravg.pred_demand = new; -} - -/* - * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) - */ -static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime) -{ - int new_window, full_window = 0; - int p_is_curr_task = (p == rq->curr); - u64 mark_start = p->ravg.mark_start; - u64 window_start = rq->window_start; - u32 window_size = sched_ravg_window; - u64 delta; - u64 *curr_runnable_sum = &rq->curr_runnable_sum; - u64 *prev_runnable_sum = &rq->prev_runnable_sum; - u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; - u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - int flip_counters = 0; - int prev_sum_reset = 0; - bool new_task; - struct related_thread_group *grp; - - new_window = mark_start < window_start; - if (new_window) { - full_window = (window_start - mark_start) >= window_size; - if (p->ravg.active_windows < USHRT_MAX) - p->ravg.active_windows++; - } - - new_task = is_new_task(p); - - grp = p->grp; - if (grp && sched_freq_aggregate) { - /* cpu_time protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu_of(rq)); - - curr_runnable_sum = &cpu_time->curr_runnable_sum; - prev_runnable_sum = &cpu_time->prev_runnable_sum; - - nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - - if (cpu_time->window_start != rq->window_start) { - int nr_windows; - - delta = rq->window_start - cpu_time->window_start; - nr_windows = div64_u64(delta, window_size); - if (nr_windows > 1) - prev_sum_reset = 1; - - cpu_time->window_start = rq->window_start; - flip_counters = 1; - } - - if (p_is_curr_task && new_window) { - u64 curr_sum = rq->curr_runnable_sum; - u64 nt_curr_sum = rq->nt_curr_runnable_sum; - - if (full_window) - curr_sum = nt_curr_sum = 0; - - rq->prev_runnable_sum = curr_sum; - rq->nt_prev_runnable_sum = nt_curr_sum; - - rq->curr_runnable_sum = 0; - rq->nt_curr_runnable_sum = 0; - } - } else { - if (p_is_curr_task && new_window) { - flip_counters = 1; - if (full_window) - prev_sum_reset = 1; - } - } - - /* Handle per-task window rollover. We don't care about the idle - * task or exiting tasks. 
*/ - if (new_window && !is_idle_task(p) && !exiting_task(p)) { - u32 curr_window = 0; - - if (!full_window) - curr_window = p->ravg.curr_window; - - p->ravg.prev_window = curr_window; - p->ravg.curr_window = 0; - } - - if (flip_counters) { - u64 curr_sum = *curr_runnable_sum; - u64 nt_curr_sum = *nt_curr_runnable_sum; - - if (prev_sum_reset) - curr_sum = nt_curr_sum = 0; - - *prev_runnable_sum = curr_sum; - *nt_prev_runnable_sum = nt_curr_sum; - - *curr_runnable_sum = 0; - *nt_curr_runnable_sum = 0; - } - - if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { - /* account_busy_for_cpu_time() = 0, so no update to the - * task's current window needs to be made. This could be - * for example - * - * - a wakeup event on a task within the current - * window (!new_window below, no action required), - * - switching to a new task from idle (PICK_NEXT_TASK) - * in a new window where irqtime is 0 and we aren't - * waiting on IO */ - - if (!new_window) - return; - - /* A new window has started. The RQ demand must be rolled - * over if p is the current task. */ - if (p_is_curr_task) { - /* p is idle task */ - BUG_ON(p != rq->idle); - } - - return; - } - - if (!new_window) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. No rollover - * since we didn't start a new window. An example of this is - * when a task starts execution and then sleeps within the - * same window. */ - - if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) - delta = wallclock - mark_start; - else - delta = irqtime; - delta = scale_exec_time(delta, rq); - *curr_runnable_sum += delta; - if (new_task) - *nt_curr_runnable_sum += delta; - - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.curr_window += delta; - - return; - } - - if (!p_is_curr_task) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. A new window - * has also started, but p is not the current task, so the - * window is not rolled over - just split up and account - * as necessary into curr and prev. The window is only - * rolled over when a new window is processed for the current - * task. - * - * Irqtime can't be accounted by a task that isn't the - * currently running task. */ - - if (!full_window) { - /* A full window hasn't elapsed, account partial - * contribution to previous completed window. */ - delta = scale_exec_time(window_start - mark_start, rq); - if (!exiting_task(p)) - p->ravg.prev_window += delta; - } else { - /* Since at least one full window has elapsed, - * the contribution to the previous window is the - * full window (window_size). */ - delta = scale_exec_time(window_size, rq); - if (!exiting_task(p)) - p->ravg.prev_window = delta; - } - - *prev_runnable_sum += delta; - if (new_task) - *nt_prev_runnable_sum += delta; - - /* Account piece of busy time in the current window. */ - delta = scale_exec_time(wallclock - window_start, rq); - *curr_runnable_sum += delta; - if (new_task) - *nt_curr_runnable_sum += delta; - - if (!exiting_task(p)) - p->ravg.curr_window = delta; - - return; - } - - if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. A new window - * has started and p is the current task so rollover is - * needed. If any of these three above conditions are true - * then this busy time can't be accounted as irqtime. - * - * Busy time for the idle task or exiting tasks need not - * be accounted. 
- * - * An example of this would be a task that starts execution - * and then sleeps once a new window has begun. */ - - if (!full_window) { - /* A full window hasn't elapsed, account partial - * contribution to previous completed window. */ - delta = scale_exec_time(window_start - mark_start, rq); - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.prev_window += delta; - } else { - /* Since at least one full window has elapsed, - * the contribution to the previous window is the - * full window (window_size). */ - delta = scale_exec_time(window_size, rq); - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.prev_window = delta; - } - - /* Rollover is done here by overwriting the values in - * prev_runnable_sum and curr_runnable_sum. */ - *prev_runnable_sum += delta; - if (new_task) - *nt_prev_runnable_sum += delta; - - /* Account piece of busy time in the current window. */ - delta = scale_exec_time(wallclock - window_start, rq); - *curr_runnable_sum += delta; - if (new_task) - *nt_curr_runnable_sum += delta; - - if (!is_idle_task(p) && !exiting_task(p)) - p->ravg.curr_window = delta; - - return; - } - - if (irqtime) { - /* account_busy_for_cpu_time() = 1 so busy time needs - * to be accounted to the current window. A new window - * has started and p is the current task so rollover is - * needed. The current task must be the idle task because - * irqtime is not accounted for any other task. - * - * Irqtime will be accounted each time we process IRQ activity - * after a period of idleness, so we know the IRQ busy time - * started at wallclock - irqtime. */ - - BUG_ON(!is_idle_task(p)); - mark_start = wallclock - irqtime; - - /* Roll window over. If IRQ busy time was just in the current - * window then that is all that need be accounted. */ - if (mark_start > window_start) { - *curr_runnable_sum = scale_exec_time(irqtime, rq); - return; - } - - /* The IRQ busy time spanned multiple windows. Process the - * busy time preceding the current window start first. */ - delta = window_start - mark_start; - if (delta > window_size) - delta = window_size; - delta = scale_exec_time(delta, rq); - *prev_runnable_sum += delta; - - /* Process the remaining IRQ busy time in the current window. 
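
Not from this patch: a simplified sketch of how update_cpu_busy_time() above splits a busy interval that crosses a window boundary between the previous and current window. The frequency scaling done by scale_exec_time() is deliberately omitted.

#include <stdio.h>
#include <stdint.h>

#define WINDOW_SIZE 10000000ULL	/* 10 ms in ns, matching the default sched_ravg_window */

/*
 * Split a busy interval [mark_start, wallclock) around window_start:
 * the part before window_start is charged to the previous window
 * (capped at one full window), the remainder to the current window.
 */
static void split_busy_time(uint64_t mark_start, uint64_t window_start,
			    uint64_t wallclock, uint64_t *prev, uint64_t *curr)
{
	uint64_t before = window_start - mark_start;

	if (before > WINDOW_SIZE)
		before = WINDOW_SIZE;

	*prev = before;
	*curr = wallclock - window_start;
}

int main(void)
{
	uint64_t prev, curr;

	/* task ran from 4 ms before the boundary to 3 ms after it */
	split_busy_time(6000000, 10000000, 13000000, &prev, &curr);
	printf("prev=%llu ns curr=%llu ns\n",
	       (unsigned long long)prev, (unsigned long long)curr);
	return 0;
}
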
*/ - delta = wallclock - window_start; - rq->curr_runnable_sum = scale_exec_time(delta, rq); - - return; - } - - BUG(); -} - -static inline u32 predict_and_update_buckets(struct rq *rq, - struct task_struct *p, u32 runtime) { - - int bidx; - u32 pred_demand; - - bidx = busy_to_bucket(runtime); - pred_demand = get_pred_busy(rq, p, bidx, runtime); - bucket_increase(p->ravg.busy_buckets, bidx); - - return pred_demand; -} -#define assign_ravg_pred_demand(x) (p->ravg.pred_demand = x) - -#else /* CONFIG_SCHED_FREQ_INPUT */ - -static inline void -update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) -{ -} - -static inline void update_cpu_busy_time(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime) -{ -} - -static inline u32 predict_and_update_buckets(struct rq *rq, - struct task_struct *p, u32 runtime) -{ - return 0; -} -#define assign_ravg_pred_demand(x) - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -static void update_task_cpu_cycles(struct task_struct *p, int cpu) -{ - if (use_cycle_counter) - p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); -} - -static void -update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, - u64 wallclock, u64 irqtime) -{ - u64 cur_cycles; - int cpu = cpu_of(rq); - - lockdep_assert_held(&rq->lock); - - if (!use_cycle_counter) { - rq->cc.cycles = cpu_cur_freq(cpu); - rq->cc.time = 1; - return; - } - - cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); - - /* - * If current task is idle task and irqtime == 0 CPU was - * indeed idle and probably its cycle counter was not - * increasing. We still need estimatied CPU frequency - * for IO wait time accounting. Use the previously - * calculated frequency in such a case. - */ - if (!is_idle_task(rq->curr) || irqtime) { - if (unlikely(cur_cycles < p->cpu_cycles)) - rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); - else - rq->cc.cycles = cur_cycles - p->cpu_cycles; - rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; - - if (event == IRQ_UPDATE && is_idle_task(p)) - /* - * Time between mark_start of idle task and IRQ handler - * entry time is CPU cycle counter stall period. - * Upon IRQ handler entry sched_account_irqstart() - * replenishes idle task's cpu cycle counter so - * rq->cc.cycles now represents increased cycles during - * IRQ handler rather than time between idle entry and - * IRQ exit. Thus use irqtime as time delta. - */ - rq->cc.time = irqtime; - else - rq->cc.time = wallclock - p->ravg.mark_start; - BUG_ON((s64)rq->cc.time < 0); - } - - p->cpu_cycles = cur_cycles; - - trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); -} - -static int account_busy_for_task_demand(struct task_struct *p, int event) -{ - /* No need to bother updating task demand for exiting tasks - * or the idle task. */ - if (exiting_task(p) || is_idle_task(p)) - return 0; - - /* When a task is waking up it is completing a segment of non-busy - * time. Likewise, if wait time is not treated as busy time, then - * when a task begins to run or is migrated, it is not running and - * is completing a segment of non-busy time. */ - if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && - (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) - return 0; - - return 1; -} - -/* - * Called when new window is starting for a task, to record cpu usage over - * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 - * when, say, a real-time task runs without preemption for several windows at a - * stretch. 
- */ -static void update_history(struct rq *rq, struct task_struct *p, - u32 runtime, int samples, int event) -{ - u32 *hist = &p->ravg.sum_history[0]; - int ridx, widx; - u32 max = 0, avg, demand, pred_demand; - u64 sum = 0; - - /* Ignore windows where task had no activity */ - if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) - goto done; - - /* Push new 'runtime' value onto stack */ - widx = sched_ravg_hist_size - 1; - ridx = widx - samples; - for (; ridx >= 0; --widx, --ridx) { - hist[widx] = hist[ridx]; - sum += hist[widx]; - if (hist[widx] > max) - max = hist[widx]; - } - - for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { - hist[widx] = runtime; - sum += hist[widx]; - if (hist[widx] > max) - max = hist[widx]; - } - - p->ravg.sum = 0; - - if (sched_window_stats_policy == WINDOW_STATS_RECENT) { - demand = runtime; - } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { - demand = max; - } else { - avg = div64_u64(sum, sched_ravg_hist_size); - if (sched_window_stats_policy == WINDOW_STATS_AVG) - demand = avg; - else - demand = max(avg, runtime); - } - pred_demand = predict_and_update_buckets(rq, p, runtime); - - /* - * A throttled deadline sched class task gets dequeued without - * changing p->on_rq. Since the dequeue decrements hmp stats - * avoid decrementing it here again. - */ - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - p->sched_class->fixup_hmp_sched_stats(rq, p, demand, - pred_demand); - - p->ravg.demand = demand; - assign_ravg_pred_demand(pred_demand); - -done: - trace_sched_update_history(rq, p, runtime, samples, event); -} - -static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) -{ - delta = scale_exec_time(delta, rq); - p->ravg.sum += delta; - if (unlikely(p->ravg.sum > sched_ravg_window)) - p->ravg.sum = sched_ravg_window; -} - -/* - * Account cpu demand of task and/or update task's cpu demand history - * - * ms = p->ravg.mark_start; - * wc = wallclock - * ws = rq->window_start - * - * Three possibilities: - * - * a) Task event is contained within one window. - * window_start < mark_start < wallclock - * - * ws ms wc - * | | | - * V V V - * |---------------| - * - * In this case, p->ravg.sum is updated *iff* event is appropriate - * (ex: event == PUT_PREV_TASK) - * - * b) Task event spans two windows. - * mark_start < window_start < wallclock - * - * ms ws wc - * | | | - * V V V - * -----|------------------- - * - * In this case, p->ravg.sum is updated with (ws - ms) *iff* event - * is appropriate, then a new window sample is recorded followed - * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. - * - * c) Task event spans more than two windows. - * - * ms ws_tmp ws wc - * | | | | - * V V V V - * ---|-------|-------|-------|-------|------ - * | | - * |<------ nr_full_windows ------>| - * - * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* - * event is appropriate, window sample of p->ravg.sum is recorded, - * 'nr_full_window' samples of window_size is also recorded *iff* - * event is appropriate and finally p->ravg.sum is set to (wc - ws) - * *iff* event is appropriate. - * - * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() - * depends on it! 
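
Not from this patch: a standalone sketch of the demand selection in update_history() above for the four sched_window_stats_policy values, assuming the most recent runtime has already been pushed into the history array (as the kernel code does before choosing the demand).

#include <stdio.h>
#include <stdint.h>

#define HIST_SIZE 5	/* default sched_ravg_hist_size */

enum { WINDOW_STATS_RECENT, WINDOW_STATS_MAX, WINDOW_STATS_AVG,
       WINDOW_STATS_MAX_RECENT_AVG };

/* pick a demand figure from the recent-window history */
static uint32_t pick_demand(const uint32_t *hist, uint32_t runtime, int policy)
{
	uint64_t sum = 0;
	uint32_t max = 0, avg;
	int i;

	for (i = 0; i < HIST_SIZE; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = (uint32_t)(sum / HIST_SIZE);

	switch (policy) {
	case WINDOW_STATS_RECENT:
		return runtime;
	case WINDOW_STATS_MAX:
		return max;
	case WINDOW_STATS_AVG:
		return avg;
	default:	/* WINDOW_STATS_MAX_RECENT_AVG */
		return avg > runtime ? avg : runtime;
	}
}

int main(void)
{
	/* most recent window first, values are scaled execution time in ns */
	uint32_t hist[HIST_SIZE] = { 2000000, 9000000, 3000000, 3000000, 3000000 };

	printf("max_recent_avg: %u\n",
	       pick_demand(hist, hist[0], WINDOW_STATS_MAX_RECENT_AVG));
	printf("max:            %u\n", pick_demand(hist, hist[0], WINDOW_STATS_MAX));
	return 0;
}
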
- */ -static void update_task_demand(struct task_struct *p, struct rq *rq, - int event, u64 wallclock) -{ - u64 mark_start = p->ravg.mark_start; - u64 delta, window_start = rq->window_start; - int new_window, nr_full_windows; - u32 window_size = sched_ravg_window; - - new_window = mark_start < window_start; - if (!account_busy_for_task_demand(p, event)) { - if (new_window) - /* If the time accounted isn't being accounted as - * busy time, and a new window started, only the - * previous window need be closed out with the - * pre-existing demand. Multiple windows may have - * elapsed, but since empty windows are dropped, - * it is not necessary to account those. */ - update_history(rq, p, p->ravg.sum, 1, event); - return; - } - - if (!new_window) { - /* The simple case - busy time contained within the existing - * window. */ - add_to_task_demand(rq, p, wallclock - mark_start); - return; - } - - /* Busy time spans at least two windows. Temporarily rewind - * window_start to first window boundary after mark_start. */ - delta = window_start - mark_start; - nr_full_windows = div64_u64(delta, window_size); - window_start -= (u64)nr_full_windows * (u64)window_size; - - /* Process (window_start - mark_start) first */ - add_to_task_demand(rq, p, window_start - mark_start); - - /* Push new sample(s) into task's demand history */ - update_history(rq, p, p->ravg.sum, 1, event); - if (nr_full_windows) - update_history(rq, p, scale_exec_time(window_size, rq), - nr_full_windows, event); - - /* Roll window_start back to current to process any remainder - * in current window. */ - window_start += (u64)nr_full_windows * (u64)window_size; - - /* Process (wallclock - window_start) next */ - mark_start = window_start; - add_to_task_demand(rq, p, wallclock - mark_start); -} - -/* Reflect task activity on its demand and cpu's busy time statistics */ -static void -update_task_ravg(struct task_struct *p, struct rq *rq, int event, - u64 wallclock, u64 irqtime) -{ - if (sched_use_pelt || !rq->window_start || sched_disable_window_stats) - return; - - lockdep_assert_held(&rq->lock); - - update_window_start(rq, wallclock); - - if (!p->ravg.mark_start) { - update_task_cpu_cycles(p, cpu_of(rq)); - goto done; - } - - update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); - update_task_demand(p, rq, event, wallclock); - update_cpu_busy_time(p, rq, event, wallclock, irqtime); - update_task_pred_demand(rq, p, event); -done: - trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, - rq->cc.cycles, rq->cc.time, - _group_cpu_time(p->grp, cpu_of(rq))); - - p->ravg.mark_start = wallclock; -} - -void sched_account_irqtime(int cpu, struct task_struct *curr, - u64 delta, u64 wallclock) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags, nr_windows; - u64 cur_jiffies_ts; - - raw_spin_lock_irqsave(&rq->lock, flags); - - /* - * cputime (wallclock) uses sched_clock so use the same here for - * consistency. - */ - delta += sched_clock() - wallclock; - cur_jiffies_ts = get_jiffies_64(); - - if (is_idle_task(curr)) - update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), - delta); - - nr_windows = cur_jiffies_ts - rq->irqload_ts; - - if (nr_windows) { - if (nr_windows < 10) { - /* Decay CPU's irqload by 3/4 for each window. 
*/ - rq->avg_irqload *= (3 * nr_windows); - rq->avg_irqload = div64_u64(rq->avg_irqload, - 4 * nr_windows); - } else { - rq->avg_irqload = 0; - } - rq->avg_irqload += rq->cur_irqload; - rq->cur_irqload = 0; - } - - rq->cur_irqload += delta; - rq->irqload_ts = cur_jiffies_ts; - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) -{ - struct rq *rq = cpu_rq(cpu); - - if (!rq->window_start || sched_disable_window_stats) - return; - - if (is_idle_task(curr)) { - /* We're here without rq->lock held, IRQ disabled */ - raw_spin_lock(&rq->lock); - update_task_cpu_cycles(curr, cpu); - raw_spin_unlock(&rq->lock); - } -} - -static void reset_task_stats(struct task_struct *p) -{ - u32 sum = 0; - - if (exiting_task(p)) - sum = EXITING_TASK_MARKER; - - memset(&p->ravg, 0, sizeof(struct ravg)); - /* Retain EXITING_TASK marker */ - p->ravg.sum_history[0] = sum; -} - -static inline void mark_task_starting(struct task_struct *p) -{ - u64 wallclock; - struct rq *rq = task_rq(p); - - if (!rq->window_start || sched_disable_window_stats) { - reset_task_stats(p); - return; - } - - wallclock = sched_ktime_clock(); - p->ravg.mark_start = p->last_wake_ts = wallclock; - p->last_cpu_selected_ts = wallclock; - p->last_switch_out_ts = 0; - update_task_cpu_cycles(p, cpu_of(rq)); -} - -static inline void set_window_start(struct rq *rq) -{ - int cpu = cpu_of(rq); - struct rq *sync_rq = cpu_rq(sync_cpu); - - if (rq->window_start || !sched_enable_hmp) - return; - - if (cpu == sync_cpu) { - rq->window_start = sched_ktime_clock(); - } else { - raw_spin_unlock(&rq->lock); - double_rq_lock(rq, sync_rq); - rq->window_start = cpu_rq(sync_cpu)->window_start; -#ifdef CONFIG_SCHED_FREQ_INPUT - rq->curr_runnable_sum = rq->prev_runnable_sum = 0; - rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; -#endif - raw_spin_unlock(&sync_rq->lock); - } - - rq->curr->ravg.mark_start = rq->window_start; -} - -static inline void migrate_sync_cpu(int cpu) -{ - if (cpu == sync_cpu) - sync_cpu = smp_processor_id(); -} - -static void reset_all_task_stats(void) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - reset_task_stats(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -/* - * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field - * - * Stop accounting (exiting) task's future cpu usage - * - * We need this so that reset_all_windows_stats() can function correctly. - * reset_all_window_stats() depends on do_each_thread/for_each_thread task - * iterators to reset *all* task's statistics. Exiting tasks however become - * invisible to those iterators. sched_exit() is called on a exiting task prior - * to being removed from task_list, which will let reset_all_window_stats() - * function correctly. 
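
Not from this patch: a user-space mirror of the irqload aging at the top of sched_account_irqtime() above. When fewer than ten windows have elapsed the running average is decayed by 3/4; after a longer idle stretch it is dropped entirely; in either case the accumulated cur_irqload is then folded in and cleared.

#include <stdio.h>
#include <stdint.h>

static void age_irqload(uint64_t *avg_irqload, uint64_t *cur_irqload,
			uint64_t nr_windows)
{
	if (nr_windows) {
		if (nr_windows < 10)
			/* decay the running average by 3/4 */
			*avg_irqload = (*avg_irqload * 3 * nr_windows) /
				       (4 * nr_windows);
		else
			/* idle long enough: forget the history */
			*avg_irqload = 0;
		*avg_irqload += *cur_irqload;
		*cur_irqload = 0;
	}
}

int main(void)
{
	uint64_t avg = 8000000, cur = 2000000;

	age_irqload(&avg, &cur, 1);	/* 8 ms * 3/4 + 2 ms = 8 ms */
	printf("avg_irqload=%llu\n", (unsigned long long)avg);

	age_irqload(&avg, &cur, 12);	/* long idle: history reset, cur already 0 */
	printf("avg_irqload=%llu\n", (unsigned long long)avg);
	return 0;
}
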
- */ -void sched_exit(struct task_struct *p) -{ - unsigned long flags; - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - u64 wallclock; - - sched_set_group_id(p, 0); - - raw_spin_lock_irqsave(&rq->lock, flags); - /* rq->curr == p */ - wallclock = sched_ktime_clock(); - update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - dequeue_task(rq, p, 0); - reset_task_stats(p); - p->ravg.mark_start = wallclock; - p->ravg.sum_history[0] = EXITING_TASK_MARKER; - enqueue_task(rq, p, 0); - clear_ed_task(p, rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); - - put_cpu(); -} - -static void disable_window_stats(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - sched_disable_window_stats = 1; - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_restore(flags); -} - -/* Called with all cpu's rq->lock held */ -static void enable_window_stats(void) -{ - sched_disable_window_stats = 0; - -} - -enum reset_reason_code { - WINDOW_CHANGE, - POLICY_CHANGE, - HIST_SIZE_CHANGE, - FREQ_AGGREGATE_CHANGE, -}; - -const char *sched_window_reset_reasons[] = { - "WINDOW_CHANGE", - "POLICY_CHANGE", - "HIST_SIZE_CHANGE", -}; - -/* Called with IRQs enabled */ -void reset_all_window_stats(u64 window_start, unsigned int window_size) -{ - int cpu; - unsigned long flags; - u64 start_ts = sched_ktime_clock(); - int reason = WINDOW_CHANGE; - unsigned int old = 0, new = 0; - struct related_thread_group *grp; - - disable_window_stats(); - - reset_all_task_stats(); - - local_irq_save(flags); - - read_lock(&related_thread_group_lock); - - for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - } - - list_for_each_entry(grp, &related_thread_groups, list) { - int j; - - for_each_possible_cpu(j) { - struct group_cpu_time *cpu_time; - /* Protected by rq lock */ - cpu_time = _group_cpu_time(grp, j); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - if (window_start) - cpu_time->window_start = window_start; - } - } - - if (window_size) { - sched_ravg_window = window_size * TICK_NSEC; - set_hmp_defaults(); - } - - enable_window_stats(); - - for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - - if (window_start) - rq->window_start = window_start; -#ifdef CONFIG_SCHED_FREQ_INPUT - rq->curr_runnable_sum = rq->prev_runnable_sum = 0; - rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; -#endif - reset_cpu_hmp_stats(cpu, 1); - } - - if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { - reason = POLICY_CHANGE; - old = sched_window_stats_policy; - new = sysctl_sched_window_stats_policy; - sched_window_stats_policy = sysctl_sched_window_stats_policy; - } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { - reason = HIST_SIZE_CHANGE; - old = sched_ravg_hist_size; - new = sysctl_sched_ravg_hist_size; - sched_ravg_hist_size = sysctl_sched_ravg_hist_size; - } -#ifdef CONFIG_SCHED_FREQ_INPUT - else if (sched_freq_aggregate != - sysctl_sched_freq_aggregate) { - reason = FREQ_AGGREGATE_CHANGE; - old = sched_freq_aggregate; - new = sysctl_sched_freq_aggregate; - sched_freq_aggregate = sysctl_sched_freq_aggregate; - } -#endif - - for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - raw_spin_unlock(&rq->lock); - } - - read_unlock(&related_thread_group_lock); - - local_irq_restore(flags); - - trace_sched_reset_all_window_stats(window_start, window_size, - sched_ktime_clock() - start_ts, reason, old, new); -} - -#ifdef CONFIG_SCHED_FREQ_INPUT - 
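[Editor's note] After zeroing the per-CPU and per-group counters under every runqueue lock, reset_all_window_stats() above works out which tunable change triggered the reset and records the old and new values for the trace event. A reduced userspace model of that reason selection; the variable names and values are illustrative, not the kernel's:

#include <stdio.h>

enum demo_reset_reason { WINDOW_CHANGE, POLICY_CHANGE, HIST_SIZE_CHANGE };

static unsigned int live_policy = 0, live_hist_size = 5;      /* values in use */
static unsigned int sysctl_policy = 0, sysctl_hist_size = 5;  /* values requested via sysctl */

static enum demo_reset_reason demo_sync_tunables(unsigned int *old, unsigned int *new)
{
    if (live_policy != sysctl_policy) {
        *old = live_policy; *new = sysctl_policy;
        live_policy = sysctl_policy;
        return POLICY_CHANGE;
    }
    if (live_hist_size != sysctl_hist_size) {
        *old = live_hist_size; *new = sysctl_hist_size;
        live_hist_size = sysctl_hist_size;
        return HIST_SIZE_CHANGE;
    }
    return WINDOW_CHANGE; /* default: the window size itself changed */
}

int main(void)
{
    unsigned int old = 0, new = 0;

    sysctl_hist_size = 3;   /* e.g. a sysctl write shrank the history */
    printf("reason=%d old=%u new=%u\n", demo_sync_tunables(&old, &new), old, new);
    return 0;
}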
-static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); - -void sched_get_cpus_busy(struct sched_load *busy, - const struct cpumask *query_cpus) -{ - unsigned long flags; - struct rq *rq; - const int cpus = cpumask_weight(query_cpus); - u64 load[cpus], group_load[cpus]; - u64 nload[cpus], ngload[cpus]; - u64 pload[cpus]; - unsigned int cur_freq[cpus], max_freq[cpus]; - int notifier_sent[cpus]; - int early_detection[cpus]; - int cpu, i = 0; - unsigned int window_size; - u64 max_prev_sum = 0; - int max_busy_cpu = cpumask_first(query_cpus); - struct related_thread_group *grp; - - if (unlikely(cpus == 0)) - return; - - /* - * This function could be called in timer context, and the - * current task may have been executing for a long time. Ensure - * that the window stats are current by doing an update. - */ - read_lock(&related_thread_group_lock); - - local_irq_save(flags); - for_each_cpu(cpu, query_cpus) - raw_spin_lock(&cpu_rq(cpu)->lock); - - window_size = sched_ravg_window; - - for_each_cpu(cpu, query_cpus) { - rq = cpu_rq(cpu); - - update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), - 0); - cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); - - load[i] = rq->old_busy_time = rq->prev_runnable_sum; - nload[i] = rq->nt_prev_runnable_sum; - pload[i] = rq->hmp_stats.pred_demands_sum; - rq->old_estimated_time = pload[i]; - - if (load[i] > max_prev_sum) { - max_prev_sum = load[i]; - max_busy_cpu = cpu; - } - - notifier_sent[i] = rq->notifier_sent; - early_detection[i] = (rq->ed_task != NULL); - rq->notifier_sent = 0; - cur_freq[i] = cpu_cur_freq(cpu); - max_freq[i] = cpu_max_freq(cpu); - i++; - } - - for_each_related_thread_group(grp) { - for_each_cpu(cpu, query_cpus) { - /* Protected by rq_lock */ - struct group_cpu_time *cpu_time = - _group_cpu_time(grp, cpu); - sync_window_start(cpu_rq(cpu), cpu_time); - } - } - - i = 0; - for_each_cpu(cpu, query_cpus) { - group_load[i] = 0; - ngload[i] = 0; - - if (early_detection[i]) - goto skip_early; - - rq = cpu_rq(cpu); - if (!notifier_sent[i]) { - if (cpu == max_busy_cpu) - group_load_in_freq_domain( - &rq->freq_domain_cpumask, - &group_load[i], &ngload[i]); - } else { - _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); - } - - load[i] += group_load[i]; - nload[i] += ngload[i]; - /* - * Scale load in reference to cluster max_possible_freq. - * - * Note that scale_load_to_cpu() scales load in reference to - * the cluster max_freq. 
- */ - load[i] = scale_load_to_cpu(load[i], cpu); - nload[i] = scale_load_to_cpu(nload[i], cpu); - pload[i] = scale_load_to_cpu(pload[i], cpu); -skip_early: - i++; - } - - for_each_cpu(cpu, query_cpus) - raw_spin_unlock(&(cpu_rq(cpu))->lock); - local_irq_restore(flags); - - read_unlock(&related_thread_group_lock); - - i = 0; - for_each_cpu(cpu, query_cpus) { - rq = cpu_rq(cpu); - - if (early_detection[i]) { - busy[i].prev_load = div64_u64(sched_ravg_window, - NSEC_PER_USEC); - busy[i].new_task_load = 0; - goto exit_early; - } - - if (!notifier_sent[i]) { - load[i] = scale_load_to_freq(load[i], max_freq[i], - cur_freq[i]); - nload[i] = scale_load_to_freq(nload[i], max_freq[i], - cur_freq[i]); - if (load[i] > window_size) - load[i] = window_size; - if (nload[i] > window_size) - nload[i] = window_size; - - load[i] = scale_load_to_freq(load[i], cur_freq[i], - cpu_max_possible_freq(cpu)); - nload[i] = scale_load_to_freq(nload[i], cur_freq[i], - cpu_max_possible_freq(cpu)); - } else { - load[i] = scale_load_to_freq(load[i], max_freq[i], - cpu_max_possible_freq(cpu)); - nload[i] = scale_load_to_freq(nload[i], max_freq[i], - cpu_max_possible_freq(cpu)); - } - pload[i] = scale_load_to_freq(pload[i], max_freq[i], - rq->cluster->max_possible_freq); - - busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); - busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); - busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); - -exit_early: - trace_sched_get_busy(cpu, busy[i].prev_load, - busy[i].new_task_load, - busy[i].predicted_load, - early_detection[i]); - i++; - } -} - -void sched_set_io_is_busy(int val) -{ - sched_io_is_busy = val; -} - -int sched_set_window(u64 window_start, unsigned int window_size) -{ - u64 now, cur_jiffies, jiffy_ktime_ns; - s64 ws; - unsigned long flags; - - if (sched_use_pelt || - (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)) - return -EINVAL; - - mutex_lock(&policy_mutex); - - /* - * Get a consistent view of ktime, jiffies, and the time - * since the last jiffy (based on last_jiffies_update). 
- */ - local_irq_save(flags); - cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); - local_irq_restore(flags); - - /* translate window_start from jiffies to nanoseconds */ - ws = (window_start - cur_jiffies); /* jiffy difference */ - ws *= TICK_NSEC; - ws += jiffy_ktime_ns; - - /* roll back calculated window start so that it is in - * the past (window stats must have a current window) */ - while (ws > now) - ws -= (window_size * TICK_NSEC); - - BUG_ON(sched_ktime_clock() < ws); - - reset_all_window_stats(ws, window_size); - - sched_update_freq_max_load(cpu_possible_mask); - - mutex_unlock(&policy_mutex); - - return 0; -} - -static void fixup_busy_time(struct task_struct *p, int new_cpu) -{ - struct rq *src_rq = task_rq(p); - struct rq *dest_rq = cpu_rq(new_cpu); - u64 wallclock; - u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; - u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; - u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; - u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - int migrate_type; - struct migration_sum_data d; - bool new_task; - struct related_thread_group *grp; - - if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) - return; - - if (exiting_task(p)) { - clear_ed_task(p, src_rq); - return; - } - - if (p->state == TASK_WAKING) - double_rq_lock(src_rq, dest_rq); - - if (sched_disable_window_stats) - goto done; - - wallclock = sched_ktime_clock(); - - update_task_ravg(task_rq(p)->curr, task_rq(p), - TASK_UPDATE, - wallclock, 0); - update_task_ravg(dest_rq->curr, dest_rq, - TASK_UPDATE, wallclock, 0); - - update_task_ravg(p, task_rq(p), TASK_MIGRATE, - wallclock, 0); - - update_task_cpu_cycles(p, new_cpu); - - new_task = is_new_task(p); - /* Protected by rq_lock */ - grp = p->grp; - if (grp && sched_freq_aggregate) { - struct group_cpu_time *cpu_time; - - migrate_type = GROUP_TO_GROUP; - /* Protected by rq_lock */ - cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); - d.src_rq = NULL; - d.src_cpu_time = cpu_time; - src_curr_runnable_sum = &cpu_time->curr_runnable_sum; - src_prev_runnable_sum = &cpu_time->prev_runnable_sum; - src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - - /* Protected by rq_lock */ - cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; - dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; - dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; - dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - sync_window_start(dest_rq, cpu_time); - } else { - migrate_type = RQ_TO_RQ; - d.src_rq = src_rq; - d.src_cpu_time = NULL; - d.dst_rq = dest_rq; - d.dst_cpu_time = NULL; - src_curr_runnable_sum = &src_rq->curr_runnable_sum; - src_prev_runnable_sum = &src_rq->prev_runnable_sum; - src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; - - dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; - dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; - dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; - dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; - } - - if (p->ravg.curr_window) { - *src_curr_runnable_sum -= p->ravg.curr_window; - *dst_curr_runnable_sum += p->ravg.curr_window; - if (new_task) { - *src_nt_curr_runnable_sum -= p->ravg.curr_window; - *dst_nt_curr_runnable_sum += p->ravg.curr_window; - } - } - - if (p->ravg.prev_window) { - 
*src_prev_runnable_sum -= p->ravg.prev_window; - *dst_prev_runnable_sum += p->ravg.prev_window; - if (new_task) { - *src_nt_prev_runnable_sum -= p->ravg.prev_window; - *dst_nt_prev_runnable_sum += p->ravg.prev_window; - } - } - - if (p == src_rq->ed_task) { - src_rq->ed_task = NULL; - if (!dest_rq->ed_task) - dest_rq->ed_task = p; - } - - trace_sched_migration_update_sum(p, migrate_type, &d); - BUG_ON((s64)*src_prev_runnable_sum < 0); - BUG_ON((s64)*src_curr_runnable_sum < 0); - BUG_ON((s64)*src_nt_prev_runnable_sum < 0); - BUG_ON((s64)*src_nt_curr_runnable_sum < 0); - -done: - if (p->state == TASK_WAKING) - double_rq_unlock(src_rq, dest_rq); -} - -#else - -static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -#define sched_up_down_migrate_auto_update 1 -static void check_for_up_down_migrate_update(const struct cpumask *cpus) -{ - int i = cpumask_first(cpus); - - if (!sched_up_down_migrate_auto_update) - return; - - if (cpu_max_possible_capacity(i) == max_possible_capacity) - return; - - if (cpu_max_possible_freq(i) == cpu_max_freq(i)) - up_down_migrate_scale_factor = 1024; - else - up_down_migrate_scale_factor = (1024 * - cpu_max_possible_freq(i)) / cpu_max_freq(i); - - update_up_down_migrate(); -} - -/* Return cluster which can offer required capacity for group */ -static struct sched_cluster * -best_cluster(struct related_thread_group *grp, u64 total_demand) -{ - struct sched_cluster *cluster = NULL; - - for_each_sched_cluster(cluster) { - if (group_will_fit(cluster, grp, total_demand)) - return cluster; - } - - return NULL; -} - -static void _set_preferred_cluster(struct related_thread_group *grp) -{ - struct task_struct *p; - u64 combined_demand = 0; - - if (!sysctl_sched_enable_colocation) { - grp->last_update = sched_ktime_clock(); - grp->preferred_cluster = NULL; - return; - } - - /* - * wakeup of two or more related tasks could race with each other and - * could result in multiple calls to _set_preferred_cluster being issued - * at same time. Avoid overhead in such cases of rechecking preferred - * cluster - */ - if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) - return; - - list_for_each_entry(p, &grp->tasks, grp_list) - combined_demand += p->ravg.demand; - - grp->preferred_cluster = best_cluster(grp, combined_demand); - grp->last_update = sched_ktime_clock(); - trace_sched_set_preferred_cluster(grp, combined_demand); -} - -static void set_preferred_cluster(struct related_thread_group *grp) -{ - raw_spin_lock(&grp->lock); - _set_preferred_cluster(grp); - raw_spin_unlock(&grp->lock); -} - -#define ADD_TASK 0 -#define REM_TASK 1 - -#ifdef CONFIG_SCHED_FREQ_INPUT - -static void -update_task_ravg(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime); - -static inline void free_group_cputime(struct related_thread_group *grp) -{ - free_percpu(grp->cpu_time); -} - -static int alloc_group_cputime(struct related_thread_group *grp) -{ - int i; - struct group_cpu_time *cpu_time; - int cpu = raw_smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - u64 window_start = rq->window_start; - - grp->cpu_time = alloc_percpu(struct group_cpu_time); - if (!grp->cpu_time) - return -ENOMEM; - - for_each_possible_cpu(i) { - cpu_time = per_cpu_ptr(grp->cpu_time, i); - memset(cpu_time, 0, sizeof(struct group_cpu_time)); - cpu_time->window_start = window_start; - } - - return 0; -} - -/* - * A group's window_start may be behind. When moving it forward, flip prev/curr - * counters. 
When moving forward > 1 window, prev counter is set to 0 - */ -static inline void -sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) -{ - u64 delta; - int nr_windows; - u64 curr_sum = cpu_time->curr_runnable_sum; - u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; - - delta = rq->window_start - cpu_time->window_start; - if (!delta) - return; - - nr_windows = div64_u64(delta, sched_ravg_window); - if (nr_windows > 1) - curr_sum = nt_curr_sum = 0; - - cpu_time->prev_runnable_sum = curr_sum; - cpu_time->curr_runnable_sum = 0; - - cpu_time->nt_prev_runnable_sum = nt_curr_sum; - cpu_time->nt_curr_runnable_sum = 0; - - cpu_time->window_start = rq->window_start; -} - -/* - * Task's cpu usage is accounted in: - * rq->curr/prev_runnable_sum, when its ->grp is NULL - * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL - * - * Transfer task's cpu usage between those counters when transitioning between - * groups - */ -static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, - struct task_struct *p, int event) -{ - u64 wallclock; - struct group_cpu_time *cpu_time; - u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; - u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; - u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; - u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; - struct migration_sum_data d; - int migrate_type; - - if (!sched_freq_aggregate) - return; - - wallclock = sched_ktime_clock(); - - update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); - - /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ - cpu_time = _group_cpu_time(grp, cpu_of(rq)); - if (event == ADD_TASK) { - sync_window_start(rq, cpu_time); - migrate_type = RQ_TO_GROUP; - d.src_rq = rq; - d.src_cpu_time = NULL; - d.dst_rq = NULL; - d.dst_cpu_time = cpu_time; - src_curr_runnable_sum = &rq->curr_runnable_sum; - dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; - src_prev_runnable_sum = &rq->prev_runnable_sum; - dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; - - src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; - dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - } else if (event == REM_TASK) { - migrate_type = GROUP_TO_RQ; - d.src_rq = NULL; - d.src_cpu_time = cpu_time; - d.dst_rq = rq; - d.dst_cpu_time = NULL; - - /* - * In case of REM_TASK, cpu_time->window_start would be - * uptodate, because of the update_task_ravg() we called - * above on the moving task. 
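[Editor's note] sync_window_start() above rolls a group's per-CPU counters forward to the runqueue's current window: exactly one window behind means the "current" sums become the "previous" sums; more than one window behind means the previous window was empty, so both are dropped. The new-task (nt_) sums follow the same pattern. A standalone model of that flip, with an illustrative struct and a fixed window size:

#include <stdint.h>
#include <stdio.h>

#define WINDOW_NS 20000000ULL  /* assumed 20 ms window, illustrative only */

struct demo_group_time {
    uint64_t window_start;
    uint64_t curr_sum, prev_sum;
};

static void demo_sync_window_start(struct demo_group_time *gt, uint64_t rq_window_start)
{
    uint64_t delta = rq_window_start - gt->window_start;
    uint64_t curr = gt->curr_sum;

    if (!delta)
        return;

    if (delta / WINDOW_NS > 1)
        curr = 0;        /* skipped a whole window: the previous one was empty */

    gt->prev_sum = curr; /* what was "current" is now the previous window */
    gt->curr_sum = 0;
    gt->window_start = rq_window_start;
}

int main(void)
{
    struct demo_group_time gt = { .window_start = 0, .curr_sum = 7000, .prev_sum = 3000 };

    demo_sync_window_start(&gt, WINDOW_NS);   /* exactly one window forward */
    printf("prev=%llu curr=%llu\n",
           (unsigned long long)gt.prev_sum, (unsigned long long)gt.curr_sum);
    return 0;
}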
Hence no need for - * sync_window_start() - */ - src_curr_runnable_sum = &cpu_time->curr_runnable_sum; - dst_curr_runnable_sum = &rq->curr_runnable_sum; - src_prev_runnable_sum = &cpu_time->prev_runnable_sum; - dst_prev_runnable_sum = &rq->prev_runnable_sum; - - src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; - dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; - src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; - dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; - } - - *src_curr_runnable_sum -= p->ravg.curr_window; - *dst_curr_runnable_sum += p->ravg.curr_window; - - *src_prev_runnable_sum -= p->ravg.prev_window; - *dst_prev_runnable_sum += p->ravg.prev_window; - - if (is_new_task(p)) { - *src_nt_curr_runnable_sum -= p->ravg.curr_window; - *dst_nt_curr_runnable_sum += p->ravg.curr_window; - *src_nt_prev_runnable_sum -= p->ravg.prev_window; - *dst_nt_prev_runnable_sum += p->ravg.prev_window; - } - - trace_sched_migration_update_sum(p, migrate_type, &d); - - BUG_ON((s64)*src_curr_runnable_sum < 0); - BUG_ON((s64)*src_prev_runnable_sum < 0); -} - -static inline struct group_cpu_time * -task_group_cpu_time(struct task_struct *p, int cpu) -{ - return _group_cpu_time(rcu_dereference(p->grp), cpu); -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu) -{ - return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; -} - -#else /* CONFIG_SCHED_FREQ_INPUT */ - -static inline void free_group_cputime(struct related_thread_group *grp) { } - -static inline int alloc_group_cputime(struct related_thread_group *grp) -{ - return 0; -} - -static inline void transfer_busy_time(struct rq *rq, - struct related_thread_group *grp, struct task_struct *p, int event) -{ -} - -static struct group_cpu_time * -task_group_cpu_time(struct task_struct *p, int cpu) -{ - return NULL; -} - -static inline struct group_cpu_time * -_group_cpu_time(struct related_thread_group *grp, int cpu) -{ - return NULL; -} - -#endif - -struct related_thread_group *alloc_related_thread_group(int group_id) -{ - struct related_thread_group *grp; - - grp = kzalloc(sizeof(*grp), GFP_KERNEL); - if (!grp) - return ERR_PTR(-ENOMEM); - - if (alloc_group_cputime(grp)) { - kfree(grp); - return ERR_PTR(-ENOMEM); - } - - grp->id = group_id; - INIT_LIST_HEAD(&grp->tasks); - INIT_LIST_HEAD(&grp->list); - raw_spin_lock_init(&grp->lock); - - return grp; -} - -struct related_thread_group *lookup_related_thread_group(unsigned int group_id) -{ - struct related_thread_group *grp; - - list_for_each_entry(grp, &related_thread_groups, list) { - if (grp->id == group_id) - return grp; - } - - return NULL; -} - -/* See comments before preferred_cluster() */ -static void free_related_thread_group(struct rcu_head *rcu) -{ - struct related_thread_group *grp = container_of(rcu, struct - related_thread_group, rcu); - - free_group_cputime(grp); - kfree(grp); -} - -static void remove_task_from_group(struct task_struct *p) -{ - struct related_thread_group *grp = p->grp; - struct rq *rq; - int empty_group = 1; - - raw_spin_lock(&grp->lock); - - rq = __task_rq_lock(p); - transfer_busy_time(rq, p->grp, p, REM_TASK); - list_del_init(&p->grp_list); - rcu_assign_pointer(p->grp, NULL); - __task_rq_unlock(rq); - - if (!list_empty(&grp->tasks)) { - empty_group = 0; - _set_preferred_cluster(grp); - } - - raw_spin_unlock(&grp->lock); - - if (empty_group) { - list_del(&grp->list); - call_rcu(&grp->rcu, free_related_thread_group); - } -} - -static int -add_task_to_group(struct task_struct *p, struct 
related_thread_group *grp) -{ - struct rq *rq; - - raw_spin_lock(&grp->lock); - - /* - * Change p->grp under rq->lock. Will prevent races with read-side - * reference of p->grp in various hot-paths - */ - rq = __task_rq_lock(p); - transfer_busy_time(rq, grp, p, ADD_TASK); - list_add(&p->grp_list, &grp->tasks); - rcu_assign_pointer(p->grp, grp); - __task_rq_unlock(rq); - - _set_preferred_cluster(grp); - - raw_spin_unlock(&grp->lock); - - return 0; -} - -int sched_set_group_id(struct task_struct *p, unsigned int group_id) -{ - int rc = 0, destroy = 0; - unsigned long flags; - struct related_thread_group *grp = NULL, *new = NULL; - -redo: - raw_spin_lock_irqsave(&p->pi_lock, flags); - - if ((current != p && p->flags & PF_EXITING) || - (!p->grp && !group_id) || - (p->grp && p->grp->id == group_id)) - goto done; - - write_lock(&related_thread_group_lock); - - if (!group_id) { - remove_task_from_group(p); - write_unlock(&related_thread_group_lock); - goto done; - } - - if (p->grp && p->grp->id != group_id) - remove_task_from_group(p); - - grp = lookup_related_thread_group(group_id); - if (!grp && !new) { - /* New group */ - write_unlock(&related_thread_group_lock); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - new = alloc_related_thread_group(group_id); - if (IS_ERR(new)) - return -ENOMEM; - destroy = 1; - /* Rerun checks (like task exiting), since we dropped pi_lock */ - goto redo; - } else if (!grp && new) { - /* New group - use object allocated before */ - destroy = 0; - list_add(&new->list, &related_thread_groups); - grp = new; - } - - BUG_ON(!grp); - rc = add_task_to_group(p, grp); - write_unlock(&related_thread_group_lock); -done: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - - if (new && destroy) { - free_group_cputime(new); - kfree(new); - } - - return rc; -} - -unsigned int sched_get_group_id(struct task_struct *p) -{ - unsigned int group_id; - struct related_thread_group *grp; - - rcu_read_lock(); - grp = task_related_thread_group(p); - group_id = grp ? 
grp->id : 0; - rcu_read_unlock(); - - return group_id; -} - -static void update_cpu_cluster_capacity(const cpumask_t *cpus) -{ - int i; - struct sched_cluster *cluster; - struct cpumask cpumask; - - cpumask_copy(&cpumask, cpus); - pre_big_task_count_change(cpu_possible_mask); - - for_each_cpu(i, &cpumask) { - cluster = cpu_rq(i)->cluster; - cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); - - cluster->capacity = compute_capacity(cluster); - cluster->load_scale_factor = compute_load_scale_factor(cluster); - - /* 'cpus' can contain cpumask more than one cluster */ - check_for_up_down_migrate_update(&cluster->cpus); - } - - __update_min_max_capacity(); - - post_big_task_count_change(cpu_possible_mask); -} - -static DEFINE_SPINLOCK(cpu_freq_min_max_lock); -void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) -{ - struct cpumask cpumask; - struct sched_cluster *cluster; - int i, update_capacity = 0; - unsigned long flags; - - spin_lock_irqsave(&cpu_freq_min_max_lock, flags); - cpumask_copy(&cpumask, cpus); - for_each_cpu(i, &cpumask) { - cluster = cpu_rq(i)->cluster; - cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); - - update_capacity += (cluster->max_mitigated_freq != fmax); - cluster->max_mitigated_freq = fmax; - } - spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); - - if (update_capacity) - update_cpu_cluster_capacity(cpus); -} - -static int cpufreq_notifier_policy(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_policy *policy = (struct cpufreq_policy *)data; - struct sched_cluster *cluster = NULL; - struct cpumask policy_cluster = *policy->related_cpus; - unsigned int orig_max_freq = 0; - int i, j, update_capacity = 0; - - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && - val != CPUFREQ_CREATE_POLICY) - return 0; - - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { - update_min_max_capacity(); - return 0; - } - - max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); - if (min_max_freq == 1) - min_max_freq = UINT_MAX; - min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); - BUG_ON(!min_max_freq); - BUG_ON(!policy->max); - - for_each_cpu(i, &policy_cluster) { - cluster = cpu_rq(i)->cluster; - cpumask_andnot(&policy_cluster, &policy_cluster, - &cluster->cpus); - - orig_max_freq = cluster->max_freq; - cluster->min_freq = policy->min; - cluster->max_freq = policy->max; - cluster->cur_freq = policy->cur; - - if (!cluster->freq_init_done) { - mutex_lock(&cluster_lock); - for_each_cpu(j, &cluster->cpus) - cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, - policy->related_cpus); - cluster->max_possible_freq = policy->cpuinfo.max_freq; - cluster->max_possible_capacity = - compute_max_possible_capacity(cluster); - cluster->freq_init_done = true; - - sort_clusters(); - update_all_clusters_stats(); - mutex_unlock(&cluster_lock); - continue; - } - - update_capacity += (orig_max_freq != cluster->max_freq); - } - - if (update_capacity) - update_cpu_cluster_capacity(policy->related_cpus); - - return 0; -} - -static int cpufreq_notifier_trans(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; - unsigned int cpu = freq->cpu, new_freq = freq->new; - unsigned long flags; - struct sched_cluster *cluster; - struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; - int i, j; - - if (val != CPUFREQ_POSTCHANGE) - return 0; - - BUG_ON(!new_freq); - - if (cpu_cur_freq(cpu) == new_freq) - return 0; - - for_each_cpu(i, 
&policy_cpus) { - cluster = cpu_rq(i)->cluster; - - for_each_cpu(j, &cluster->cpus) { - struct rq *rq = cpu_rq(j); - - raw_spin_lock_irqsave(&rq->lock, flags); - update_task_ravg(rq->curr, rq, TASK_UPDATE, - sched_ktime_clock(), 0); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - - cluster->cur_freq = new_freq; - cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); - } - - return 0; -} - -static int pwr_stats_ready_notifier(struct notifier_block *nb, - unsigned long cpu, void *data) -{ - cpumask_t mask = CPU_MASK_NONE; - - cpumask_set_cpu(cpu, &mask); - sched_update_freq_max_load(&mask); - - mutex_lock(&cluster_lock); - sort_clusters(); - mutex_unlock(&cluster_lock); - - return 0; -} - -static struct notifier_block notifier_policy_block = { - .notifier_call = cpufreq_notifier_policy -}; - -static struct notifier_block notifier_trans_block = { - .notifier_call = cpufreq_notifier_trans -}; - -static struct notifier_block notifier_pwr_stats_ready = { - .notifier_call = pwr_stats_ready_notifier -}; - -int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) -{ - return -EINVAL; -} - -static int register_sched_callback(void) -{ - int ret; - - if (!sched_enable_hmp) - return 0; - - ret = cpufreq_register_notifier(¬ifier_policy_block, - CPUFREQ_POLICY_NOTIFIER); - - if (!ret) - ret = cpufreq_register_notifier(¬ifier_trans_block, - CPUFREQ_TRANSITION_NOTIFIER); - - register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); - - return 0; -} - -/* - * cpufreq callbacks can be registered at core_initcall or later time. - * Any registration done prior to that is "forgotten" by cpufreq. See - * initialization of variable init_cpufreq_transition_notifier_list_called - * for further information. - */ -core_initcall(register_sched_callback); - -static inline int update_preferred_cluster(struct related_thread_group *grp, - struct task_struct *p, u32 old_load) -{ - u32 new_load = task_load(p); - - if (!grp) - return 0; - - /* - * Update if task's load has changed significantly or a complete window - * has passed since we last updated preference - */ - if (abs(new_load - old_load) > sched_ravg_window / 4 || - sched_ktime_clock() - grp->last_update > sched_ravg_window) - return 1; - - return 0; -} - -#else /* CONFIG_SCHED_HMP */ - -static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } - -static void -update_task_ravg(struct task_struct *p, struct rq *rq, - int event, u64 wallclock, u64 irqtime) -{ -} - -static inline void mark_task_starting(struct task_struct *p) {} - -static inline void set_window_start(struct rq *rq) {} - -static inline void migrate_sync_cpu(int cpu) {} - -#endif /* CONFIG_SCHED_HMP */ - #ifdef CONFIG_SMP /* * This is how migration works: @@ -4330,34 +1134,6 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ return rq; } -static void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, - struct task_struct *p) -{ - struct migration_notify_data mnd; - bool check_groups; - - rcu_read_lock(); - check_groups = rcu_access_pointer(p->grp) != NULL; - rcu_read_unlock(); - - if (!same_freq_domain(src_cpu, dest_cpu)) { - if (!src_cpu_dead) - check_for_freq_change(cpu_rq(src_cpu), false, - check_groups); - check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); - } else { - check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); - } - - if (task_notify_on_migrate(p)) { - mnd.src_cpu = src_cpu; - mnd.dest_cpu = dest_cpu; - mnd.load = pct_task_load(p); - 
atomic_notifier_call_chain(&migration_notifier_head, 0, - (void *)&mnd); - } -} - /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -5181,8 +1957,6 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } -__read_mostly unsigned int sysctl_sched_wakeup_load_threshold = 110; - /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -5203,8 +1977,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, src_cpu, success = 0; - int notify = 0; - struct migration_notify_data mnd; #ifdef CONFIG_SMP unsigned int old_load; struct rq *rq; @@ -5309,31 +2081,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: ttwu_stat(p, cpu, wake_flags); - - if (task_notify_on_migrate(p)) { - mnd.src_cpu = src_cpu; - mnd.dest_cpu = cpu; - mnd.load = pct_task_load(p); - - /* - * Call the migration notifier with mnd for foreground task - * migrations as well as for wakeups if their load is above - * sysctl_sched_wakeup_load_threshold. This would prompt the - * cpu-boost to boost the CPU frequency on wake up of a heavy - * weight foreground task - */ - if ((src_cpu != cpu) || (mnd.load > - sysctl_sched_wakeup_load_threshold)) - notify = 1; - } - out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); - if (notify) - atomic_notifier_call_chain(&migration_notifier_head, - 0, (void *)&mnd); - if (freq_notif_allowed) { if (!same_freq_domain(src_cpu, cpu)) { check_for_freq_change(cpu_rq(cpu), @@ -5466,6 +2216,44 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; } +#ifdef CONFIG_SCHED_HMP +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + u64 wallclock; + + sched_set_group_id(p, 0); + + raw_spin_lock_irqsave(&rq->lock, flags); + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + enqueue_task(rq, p, 0); + clear_ed_task(p, rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + put_cpu(); +} +#endif /* CONFIG_SCHED_HMP */ + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
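[Editor's note] The sched_exit() path re-added above depends on reset_task_stats() preserving a sentinel in sum_history[0], so an exiting task remains recognizable even after it disappears from the task-list iterators that reset_all_window_stats() walks. A compact model of the reset-but-keep-marker step; the marker value, history depth, and demo_* names are illustrative assumptions:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DEMO_EXITING_MARKER 0xdeadbeefU  /* illustrative sentinel value */
#define DEMO_HIST 5

struct demo_ravg {
    uint32_t sum_history[DEMO_HIST];
    uint64_t demand;
    uint64_t mark_start;
};

static int demo_exiting(const struct demo_ravg *r)
{
    return r->sum_history[0] == DEMO_EXITING_MARKER;
}

static void demo_reset_task_stats(struct demo_ravg *r)
{
    uint32_t keep = demo_exiting(r) ? DEMO_EXITING_MARKER : 0;

    memset(r, 0, sizeof(*r));
    r->sum_history[0] = keep;   /* the marker survives every later reset */
}

int main(void)
{
    struct demo_ravg r = { .demand = 123456 };

    r.sum_history[0] = DEMO_EXITING_MARKER;  /* task is exiting */
    demo_reset_task_stats(&r);
    printf("still exiting? %d, demand=%llu\n",
           demo_exiting(&r), (unsigned long long)r.demand);
    return 0;
}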
@@ -5752,6 +2540,7 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, flags); init_new_task_load(p); + add_new_task_to_grp(p); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -6236,37 +3025,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_HMP -static bool early_detection_notify(struct rq *rq, u64 wallclock) -{ - struct task_struct *p; - int loop_max = 10; - - if (!sched_boost() || !rq->cfs.h_nr_running) - return 0; - - rq->ed_task = NULL; - list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { - if (!loop_max) - break; - - if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { - rq->ed_task = p; - return 1; - } - - loop_max--; - } - - return 0; -} -#else /* CONFIG_SCHED_HMP */ -static bool early_detection_notify(struct rq *rq, u64 wallclock) -{ - return 0; -} -#endif /* CONFIG_SCHED_HMP */ - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -10973,8 +7731,11 @@ void __init sched_init(void) rq->avg_irqload = 0; rq->irqload_ts = 0; rq->static_cpu_pwr_cost = 0; - rq->cc.cycles = SCHED_MIN_FREQ; + rq->cc.cycles = 1; rq->cc.time = 1; + rq->cstate = 0; + rq->wakeup_latency = 0; + rq->wakeup_energy = 0; /* * All cpus part of same cluster by default. This avoids the @@ -10982,19 +7743,14 @@ void __init sched_init(void) * like select_best_cpu() */ rq->cluster = &init_cluster; -#ifdef CONFIG_SCHED_FREQ_INPUT rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; rq->old_busy_time = 0; rq->old_estimated_time = 0; rq->old_busy_time_group = 0; - rq->notifier_sent = 0; rq->hmp_stats.pred_demands_sum = 0; #endif -#endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; - rq->cstate = 0; - rq->wakeup_latency = 0; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -11719,7 +8475,7 @@ int sched_rr_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +inline struct task_group *css_tg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct task_group, css) : NULL; } @@ -11793,63 +8549,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } -static u64 cpu_notify_on_migrate_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct task_group *tg = css_tg(css); - - return tg->notify_on_migrate; -} - -static int cpu_notify_on_migrate_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 notify) -{ - struct task_group *tg = css_tg(css); - - tg->notify_on_migrate = (notify > 0); - - return 0; -} - -#ifdef CONFIG_SCHED_HMP - -static u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct task_group *tg = css_tg(css); - - return tg->upmigrate_discouraged; -} - -static int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 upmigrate_discourage) -{ - struct task_group *tg = css_tg(css); - int discourage = upmigrate_discourage > 0; - - if (tg->upmigrate_discouraged == discourage) - return 0; - - /* - * Revisit big-task classification for tasks of this cgroup. It would - * have been efficient to walk tasks of just this cgroup in running - * state, but we don't have easy means to do that. Walk all tasks in - * running state on all cpus instead and re-visit their big task - * classification. 
- */ - get_online_cpus(); - pre_big_task_count_change(cpu_online_mask); - - tg->upmigrate_discouraged = discourage; - - post_big_task_count_change(cpu_online_mask); - put_online_cpus(); - - return 0; -} - -#endif /* CONFIG_SCHED_HMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -12135,11 +8834,6 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, #endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { - { - .name = "notify_on_migrate", - .read_u64 = cpu_notify_on_migrate_read_u64, - .write_u64 = cpu_notify_on_migrate_write_u64, - }, #ifdef CONFIG_SCHED_HMP { .name = "upmigrate_discourage", diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d1c0ef4bf07d..b6dc131f36a6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,8 +327,6 @@ do { \ P(cluster->cur_freq); P(cluster->max_freq); P(cluster->exec_scale_factor); -#endif -#ifdef CONFIG_SCHED_HMP P(hmp_stats.nr_big_tasks); SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg", rq->hmp_stats.cumulative_runnable_avg); @@ -417,10 +415,8 @@ static void sched_debug_header(struct seq_file *m) P(sched_upmigrate); P(sched_downmigrate); P(sched_init_task_load_windows); - P(sched_init_task_load_pelt); P(min_capacity); P(max_capacity); - P(sched_use_pelt); P(sched_ravg_window); #endif #undef PN @@ -644,7 +640,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __P(load_avg); #ifdef CONFIG_SCHED_HMP P(ravg.demand); - P(se.avg.runnable_avg_sum_scaled); #endif #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 584cd048c24b..ce58e2245b4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2577,450 +2577,24 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -static void add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta); -static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods); - -struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) -{ - return NULL; -} - -enum sched_boost_type { - SCHED_BOOST_NONE, - SCHED_BOOST_ON_BIG, - SCHED_BOOST_ON_ALL, -}; - #ifdef CONFIG_SCHED_HMP -/* Initial task load. Newly created tasks are assigned this load. */ -unsigned int __read_mostly sched_init_task_load_pelt; -unsigned int __read_mostly sched_init_task_load_windows; -unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; - -unsigned int max_task_load(void) -{ - if (sched_use_pelt) - return LOAD_AVG_MAX; - - return sched_ravg_window; -} - -/* Use this knob to turn on or off HMP-aware task placement logic */ -unsigned int __read_mostly sched_enable_hmp = 0; - -/* A cpu can no longer accomodate more tasks if: - * - * rq->nr_running > sysctl_sched_spill_nr_run || - * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load - */ -unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; - -/* - * Place sync wakee tasks those have less than configured demand to the waker's - * cluster. - */ -unsigned int __read_mostly sched_small_wakee_task_load; -unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; - -unsigned int __read_mostly sched_big_waker_task_load; -unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; - -/* - * CPUs with load greater than the sched_spill_load_threshold are not - * eligible for task placement. When all CPUs in a cluster achieve a - * load higher than this level, tasks becomes eligible for inter - * cluster migration. 
- */ -unsigned int __read_mostly sched_spill_load; -unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; - -/* - * Tasks whose bandwidth consumption on a cpu is more than - * sched_upmigrate are considered "big" tasks. Big tasks will be - * considered for "up" migration, i.e migrating to a cpu with better - * capacity. - */ -unsigned int __read_mostly sched_upmigrate; -unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; - -/* - * Big tasks, once migrated, will need to drop their bandwidth - * consumption to less than sched_downmigrate before they are "down" - * migrated. - */ -unsigned int __read_mostly sched_downmigrate; -unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; - -#define SCHED_UPMIGRATE_MIN_NICE 15 - -/* - * The load scale factor of a CPU gets boosted when its max frequency - * is restricted due to which the tasks are migrating to higher capacity - * CPUs early. The sched_upmigrate threshold is auto-upgraded by - * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. - */ -unsigned int up_down_migrate_scale_factor = 1024; - -/* - * Scheduler boost is a mechanism to temporarily place tasks on CPUs - * with higher capacity than those where a task would have normally - * ended up with their load characteristics. Any entity enabling - * boost is responsible for disabling it as well. - */ -unsigned int sysctl_sched_boost; - -/* - * Scheduler selects and places task to its previous CPU if sleep time is - * less than sysctl_sched_select_prev_cpu_us. - */ -static unsigned int __read_mostly -sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; -unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; - -static unsigned int __read_mostly -sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; - -unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; - -void update_up_down_migrate(void) -{ - unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); - unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); - unsigned int delta; - - if (up_down_migrate_scale_factor == 1024) - goto done; - - delta = up_migrate - down_migrate; - - up_migrate /= NSEC_PER_USEC; - up_migrate *= up_down_migrate_scale_factor; - up_migrate >>= 10; - up_migrate *= NSEC_PER_USEC; - - up_migrate = min(up_migrate, sched_ravg_window); - - down_migrate /= NSEC_PER_USEC; - down_migrate *= up_down_migrate_scale_factor; - down_migrate >>= 10; - down_migrate *= NSEC_PER_USEC; - - down_migrate = min(down_migrate, up_migrate - delta); -done: - sched_upmigrate = up_migrate; - sched_downmigrate = down_migrate; -} - -void set_hmp_defaults(void) -{ - sched_spill_load = - pct_to_real(sysctl_sched_spill_load_pct); - - update_up_down_migrate(); - -#ifdef CONFIG_SCHED_FREQ_INPUT - sched_major_task_runtime = - mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); -#endif - - sched_init_task_load_pelt = - div64_u64((u64)sysctl_sched_init_task_load_pct * - (u64)LOAD_AVG_MAX, 100); - - sched_init_task_load_windows = - div64_u64((u64)sysctl_sched_init_task_load_pct * - (u64)sched_ravg_window, 100); - - sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * - NSEC_PER_USEC; - - sched_small_wakee_task_load = - div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * - (u64)sched_ravg_window, 100); - - sched_big_waker_task_load = - div64_u64((u64)sysctl_sched_big_waker_task_load_pct * - (u64)sched_ravg_window, 100); -} - -u32 sched_get_init_task_load(struct task_struct *p) -{ - return p->init_load_pct; -} - -int sched_set_init_task_load(struct 
task_struct *p, int init_load_pct) -{ - if (init_load_pct < 0 || init_load_pct > 100) - return -EINVAL; - - p->init_load_pct = init_load_pct; - - return 0; -} - -#ifdef CONFIG_CGROUP_SCHED - -static inline int upmigrate_discouraged(struct task_struct *p) -{ - return task_group(p)->upmigrate_discouraged; -} - -#else - -static inline int upmigrate_discouraged(struct task_struct *p) -{ - return 0; -} - -#endif - -/* Is a task "big" on its current cpu */ -static inline int __is_big_task(struct task_struct *p, u64 scaled_load) -{ - int nice = task_nice(p); - - if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) - return 0; - - return scaled_load > sched_upmigrate; -} - -static inline int is_big_task(struct task_struct *p) -{ - return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); -} - -static inline u64 cpu_load(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); -} - -static inline u64 cpu_load_sync(int cpu, int sync) -{ - return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); -} - -static int boost_refcount; -static DEFINE_SPINLOCK(boost_lock); -static DEFINE_MUTEX(boost_mutex); - -static void boost_kick_cpus(void) -{ - int i; - - for_each_online_cpu(i) { - if (cpu_capacity(i) != max_capacity) - boost_kick(i); - } -} - -int sched_boost(void) -{ - return boost_refcount > 0; -} - -int sched_set_boost(int enable) -{ - unsigned long flags; - int ret = 0; - int old_refcount; - - if (!sched_enable_hmp) - return -EINVAL; - - spin_lock_irqsave(&boost_lock, flags); - - old_refcount = boost_refcount; - - if (enable == 1) { - boost_refcount++; - } else if (!enable) { - if (boost_refcount >= 1) - boost_refcount--; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (!old_refcount && boost_refcount) - boost_kick_cpus(); - - trace_sched_set_boost(boost_refcount); - spin_unlock_irqrestore(&boost_lock, flags); - - return ret; -} - -int sched_boost_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - mutex_lock(&boost_mutex); - if (!write) - sysctl_sched_boost = sched_boost(); - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write) - goto done; - - ret = (sysctl_sched_boost <= 1) ? - sched_set_boost(sysctl_sched_boost) : -EINVAL; - -done: - mutex_unlock(&boost_mutex); - return ret; -} - -/* - * Task will fit on a cpu if it's bandwidth consumption on that cpu - * will be less than sched_upmigrate. A big task that was previously - * "up" migrated will be considered fitting on "little" cpu if its - * bandwidth consumption on "little" cpu will be less than - * sched_downmigrate. 
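[Editor's note] The tunables above are stored as percentages of the window and converted to absolute nanosecond thresholds, and update_up_down_migrate() then rescales them when a CPU's max frequency is restricted while preserving the hysteresis gap between upmigrate and downmigrate. A simplified userspace sketch of that conversion (it ignores the microsecond truncation the kernel performs; window size, percentages, and demo_* names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define WINDOW_NS 20000000ULL   /* assumed 20 ms window, illustrative only */

/* Convert a percentage tunable into an absolute, per-window threshold. */
static uint64_t demo_pct_to_ns(unsigned int pct)
{
    return (uint64_t)pct * WINDOW_NS / 100;
}

/*
 * Scale the up/down-migration thresholds by a 1024-based factor (factor > 1024
 * when max frequency is restricted), keeping the gap between the two.
 */
static void demo_update_migrate(unsigned int factor, uint64_t *up, uint64_t *down)
{
    uint64_t u = demo_pct_to_ns(80), d = demo_pct_to_ns(60);  /* default percentages */
    uint64_t gap = u - d;

    if (factor != 1024) {
        u = u * factor >> 10;
        if (u > WINDOW_NS)
            u = WINDOW_NS;
        d = d * factor >> 10;
        if (d > u - gap)
            d = u - gap;    /* never let the hysteresis gap collapse */
    }
    *up = u;
    *down = d;
}

int main(void)
{
    uint64_t up, down;

    demo_update_migrate(1280, &up, &down);   /* e.g. max_possible_freq / max_freq = 1.25 */
    printf("upmigrate=%llu ns downmigrate=%llu ns\n",
           (unsigned long long)up, (unsigned long long)down);
    return 0;
}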
This will help avoid frequenty migrations for - * tasks with load close to the upmigrate threshold - */ - -static int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, - enum sched_boost_type boost_type) -{ - int upmigrate; - - if (cpu_capacity(cpu) == max_capacity) - return 1; - - if (boost_type != SCHED_BOOST_ON_BIG) { - if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || - upmigrate_discouraged(p)) - return 1; - - upmigrate = sched_upmigrate; - if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) - upmigrate = sched_downmigrate; - - if (task_load < upmigrate) - return 1; - } - - return 0; -} - -static enum sched_boost_type sched_boost_type(void) -{ - if (sched_boost()) { - if (min_possible_efficiency != max_possible_efficiency) - return SCHED_BOOST_ON_BIG; - else - return SCHED_BOOST_ON_ALL; - } - return SCHED_BOOST_NONE; -} - -static int task_will_fit(struct task_struct *p, int cpu) -{ - u64 tload = scale_load_to_cpu(task_load(p), cpu); - - return task_load_will_fit(p, tload, cpu, sched_boost_type()); -} - -int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand) -{ - int cpu = cluster_first_cpu(cluster); - int prev_capacity = 0; - unsigned int threshold = sched_upmigrate; - u64 load; - - if (cluster->capacity == max_capacity) - return 1; - - if (grp->preferred_cluster) - prev_capacity = grp->preferred_cluster->capacity; - - if (cluster->capacity < prev_capacity) - threshold = sched_downmigrate; - - load = scale_load_to_cpu(demand, cpu); - if (load < threshold) - return 1; - - return 0; -} - -/* - * Return the cost of running task p on CPU cpu. This function - * currently assumes that task p is the only task which will run on - * the CPU. - */ -unsigned int power_cost(int cpu, u64 demand) -{ - int first, mid, last; - struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); - struct cpu_pstate_pwr *costs; - struct freq_max_load *max_load; - int total_static_pwr_cost = 0; - struct rq *rq = cpu_rq(cpu); - unsigned int pc; - - if (!per_cpu_info || !per_cpu_info[cpu].ptable) - /* When power aware scheduling is not in use, or CPU - * power data is not available, just use the CPU - * capacity as a rough stand-in for real CPU power - * numbers, assuming bigger CPUs are more power - * hungry. 
*/ - return cpu_max_possible_capacity(cpu); - - rcu_read_lock(); - max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); - if (!max_load) { - pc = cpu_max_possible_capacity(cpu); - goto unlock; - } - - costs = per_cpu_info[cpu].ptable; - - if (demand <= max_load->freqs[0].hdemand) { - pc = costs[0].power; - goto unlock; - } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { - pc = costs[max_load->length - 1].power; - goto unlock; - } - - first = 0; - last = max_load->length - 1; - mid = (last - first) >> 1; - while (1) { - if (demand <= max_load->freqs[mid].hdemand) - last = mid; - else - first = mid; - - if (last - first == 1) - break; - mid = first + ((last - first) >> 1); - } - - pc = costs[last].power; - -unlock: - rcu_read_unlock(); - - if (idle_cpu(cpu) && rq->cstate) { - total_static_pwr_cost += rq->static_cpu_pwr_cost; - if (rq->cluster->dstate) - total_static_pwr_cost += - rq->cluster->static_cluster_pwr_cost; - } - - return pc + total_static_pwr_cost; - -} +/* CPU selection flag */ +#define SBC_FLAG_PREV_CPU 0x1 +#define SBC_FLAG_BEST_CAP_CPU 0x2 +#define SBC_FLAG_CPU_COST 0x4 +#define SBC_FLAG_MIN_COST 0x8 +#define SBC_FLAG_IDLE_LEAST_LOADED 0x10 +#define SBC_FLAG_IDLE_CSTATE 0x20 +#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40 +#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80 +#define SBC_FLAG_CSTATE_LOAD 0x100 +#define SBC_FLAG_BEST_SIBLING 0x200 + +/* Cluster selection flag */ +#define SBC_FLAG_COLOC_CLUSTER 0x10000 +#define SBC_FLAG_WAKER_CLUSTER 0x20000 +#define SBC_FLAG_BACKUP_CLUSTER 0x40000 struct cpu_select_env { struct task_struct *p; @@ -3036,6 +2610,8 @@ struct cpu_select_env { DECLARE_BITMAP(backup_list, NR_CPUS); u64 task_load; u64 cpu_load; + u32 sbc_best_flag; + u32 sbc_best_cluster_flag; }; struct cluster_cpu_stats { @@ -3047,45 +2623,7 @@ struct cluster_cpu_stats { s64 highest_spare_capacity; }; -#define UP_MIGRATION 1 -#define DOWN_MIGRATION 2 -#define IRQLOAD_MIGRATION 3 - -/* - * Invoked from three places: - * 1) try_to_wake_up() -> ... -> select_best_cpu() - * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu() - * 3) can_migrate_task() - * - * Its safe to de-reference p->grp in first case (since p->pi_lock is held) - * but not in other cases. 
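[Editor's note] power_cost() above binary-searches a per-CPU table sorted by the demand each frequency level can absorb, then returns the matching power entry (plus static costs when the CPU or cluster is idle in a low-power state). A self-contained model of the table lookup; the arrays stand in for the per-CPU power tables and their values are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Demand each frequency level can absorb, and its power cost (illustrative). */
static const uint64_t demo_hdemand[] = { 4000, 8000, 12000, 16000 };
static const unsigned int demo_power[] = { 50, 90, 160, 280 };
#define DEMO_LEVELS 4

static unsigned int demo_power_cost(uint64_t demand)
{
    int first = 0, last = DEMO_LEVELS - 1, mid;

    if (demand <= demo_hdemand[0])
        return demo_power[0];
    if (demand > demo_hdemand[DEMO_LEVELS - 1])
        return demo_power[DEMO_LEVELS - 1];

    /* Find the lowest level whose capacity still covers the demand. */
    while (last - first > 1) {
        mid = first + (last - first) / 2;
        if (demand <= demo_hdemand[mid])
            last = mid;
        else
            first = mid;
    }
    return demo_power[last];
}

int main(void)
{
    printf("cost for demand 9000: %u\n", demo_power_cost(9000));   /* prints 160 */
    return 0;
}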
p->grp is hence freed after a RCU grace period and - * accessed under rcu_read_lock() - */ -static inline int -preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) -{ - struct related_thread_group *grp; - int rc = 0; - - rcu_read_lock(); - - grp = task_related_thread_group(p); - if (!grp || !sysctl_sched_enable_colocation) - rc = 1; - else - rc = (grp->preferred_cluster == cluster); - - rcu_read_unlock(); - return rc; -} - -static inline struct sched_cluster *rq_cluster(struct rq *rq) -{ - return rq->cluster; -} - -static int -spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) { u64 total_load; @@ -3168,6 +2706,7 @@ select_least_power_cluster(struct cpu_select_env *env) if (env->rtg) { env->task_load = scale_load_to_cpu(task_load(env->p), cluster_first_cpu(env->rtg->preferred_cluster)); + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; return env->rtg->preferred_cluster; } @@ -3246,6 +2785,7 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats) update_spare_capacity(stats, env, i, next->capacity, cpu_load_sync(i, env->sync)); } + env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER; } } @@ -3317,6 +2857,7 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, stats->best_cpu_cstate = cpu_cstate; stats->best_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CPU_COST; return; } @@ -3329,12 +2870,14 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, stats->best_cpu_cstate = cpu_cstate; stats->best_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER; return; } /* C-state is the same. Use prev CPU to break the tie */ if (cpu == prev_cpu) { stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER; return; } @@ -3343,6 +2886,7 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, (cpu_cstate > 0 && env->cpu_load > stats->best_load))) { stats->best_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD; } } #else /* CONFIG_SCHED_HMP_CSTATE_AWARE */ @@ -3373,10 +2917,11 @@ static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, stats->min_cost = cpu_cost; stats->min_load = env->cpu_load; stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_MIN_COST; } } } -#endif +#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */ static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, struct cpu_select_env *env) @@ -3530,8 +3075,8 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, { struct sched_cluster *cluster, *pref_cluster = NULL; struct cluster_cpu_stats stats; - bool fast_path = false; struct related_thread_group *grp; + unsigned int sbc_flag = 0; struct cpu_select_env env = { .p = p, @@ -3543,6 +3088,8 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, .prev_cpu = target, .ignore_prev_cpu = 0, .rtg = NULL, + .sbc_best_flag = 0, + .sbc_best_cluster_flag = 0, }; bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); @@ -3567,8 +3114,10 @@ static int select_best_cpu(struct task_struct *p, int target, int reason, env.need_waker_cluster = 1; bitmap_zero(env.candidate_list, NR_CPUS); __set_bit(cluster->id, env.candidate_list); + env.sbc_best_cluster_flag = SBC_FLAG_WAKER_CLUSTER; + } else if (bias_to_prev_cpu(&env, &stats)) { - fast_path = true; + sbc_flag = SBC_FLAG_PREV_CPU; 
goto out; } } @@ -3592,15 +3141,20 @@ retry: } while ((cluster = next_best_cluster(cluster, &env, &stats))); if (env.need_idle) { - if (stats.best_idle_cpu >= 0) + if (stats.best_idle_cpu >= 0) { target = stats.best_idle_cpu; - else if (stats.least_loaded_cpu >= 0) + sbc_flag |= SBC_FLAG_IDLE_CSTATE; + } else if (stats.least_loaded_cpu >= 0) { target = stats.least_loaded_cpu; + sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED; + } } else if (stats.best_cpu >= 0) { if (stats.best_cpu != task_cpu(p) && - stats.min_cost == stats.best_sibling_cpu_cost) + stats.min_cost == stats.best_sibling_cpu_cost) { stats.best_cpu = stats.best_sibling_cpu; - + sbc_flag |= SBC_FLAG_BEST_SIBLING; + } + sbc_flag |= env.sbc_best_flag; target = stats.best_cpu; } else { if (env.rtg) { @@ -3609,66 +3163,20 @@ retry: } find_backup_cluster(&env, &stats); - if (stats.best_capacity_cpu >= 0) + if (stats.best_capacity_cpu >= 0) { target = stats.best_capacity_cpu; + sbc_flag |= SBC_FLAG_BEST_CAP_CPU; + } } p->last_cpu_selected_ts = sched_ktime_clock(); - + sbc_flag |= env.sbc_best_cluster_flag; out: rcu_read_unlock(); trace_sched_task_load(p, sched_boost(), env.reason, env.sync, - env.need_idle, fast_path, target); + env.need_idle, sbc_flag, target); return target; } -static void -inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) -{ - if (!sched_enable_hmp || sched_disable_window_stats) - return; - - if (is_big_task(p)) - stats->nr_big_tasks++; -} - -static void -dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) -{ - if (!sched_enable_hmp || sched_disable_window_stats) - return; - - if (is_big_task(p)) - stats->nr_big_tasks--; - - BUG_ON(stats->nr_big_tasks < 0); -} - -static void -inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) -{ - inc_nr_big_task(&rq->hmp_stats, p); - if (change_cra) - inc_cumulative_runnable_avg(&rq->hmp_stats, p); -} - -static void -dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) -{ - dec_nr_big_task(&rq->hmp_stats, p); - if (change_cra) - dec_cumulative_runnable_avg(&rq->hmp_stats, p); -} - -static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) -{ - stats->nr_big_tasks = 0; - if (reset_cra) { - stats->cumulative_runnable_avg = 0; - set_pred_demands_sum(stats, 0); - } -} - - #ifdef CONFIG_CFS_BANDWIDTH static inline struct task_group *next_task_group(struct task_group *tg) @@ -3683,7 +3191,7 @@ static inline struct task_group *next_task_group(struct task_group *tg) for (tg = container_of(&task_groups, struct task_group, list); \ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));) -static void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) +void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { struct task_group *tg; struct cfs_rq *cfs_rq; @@ -3696,66 +3204,6 @@ static void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) rcu_read_unlock(); } -#else /* CONFIG_CFS_BANDWIDTH */ - -static inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } - -#endif /* CONFIG_CFS_BANDWIDTH */ - -/* - * Return total number of tasks "eligible" to run on highest capacity cpu - * - * This is simply nr_big_tasks for cpus which are not of max_capacity and - * nr_running for cpus of max_capacity - */ -unsigned int nr_eligible_big_tasks(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - int nr_big = rq->hmp_stats.nr_big_tasks; - int nr = rq->nr_running; - - if (cpu_max_possible_capacity(cpu) != max_possible_capacity) - return nr_big; - - return nr; -} - -/* - * reset_cpu_hmp_stats - reset HMP stats for a cpu - * 
nr_big_tasks - * cumulative_runnable_avg (iff reset_cra is true) - */ -void reset_cpu_hmp_stats(int cpu, int reset_cra) -{ - reset_cfs_rq_hmp_stats(cpu, reset_cra); - reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); -} - -static void -fixup_nr_big_tasks(struct hmp_sched_stats *stats, struct task_struct *p, - s64 delta) -{ - u64 new_task_load; - u64 old_task_load; - - if (!sched_enable_hmp || sched_disable_window_stats) - return; - - old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); - new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); - - if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) - stats->nr_big_tasks--; - else if (!__is_big_task(p, old_task_load) && - __is_big_task(p, new_task_load)) - stats->nr_big_tasks++; - - BUG_ON(stats->nr_big_tasks < 0); -} - - -#ifdef CONFIG_CFS_BANDWIDTH - static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, @@ -3764,8 +3212,8 @@ static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, struct task_struct *p, int change_cra); /* Add task's contribution to a cpu' HMP statistics */ -static void -_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; @@ -3857,6 +3305,8 @@ static int task_will_be_throttled(struct task_struct *p); #else /* CONFIG_CFS_BANDWIDTH */ +inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } + static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { @@ -3887,8 +3337,8 @@ static inline int task_will_be_throttled(struct task_struct *p) return 0; } -static void -_inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) { inc_nr_big_task(&rq->hmp_stats, p); } @@ -3896,179 +3346,6 @@ _inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) #endif /* CONFIG_CFS_BANDWIDTH */ /* - * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. 
- */ -static void update_nr_big_tasks(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct task_struct *p; - - /* Do not reset cumulative_runnable_avg */ - reset_cpu_hmp_stats(cpu, 0); - - list_for_each_entry(p, &rq->cfs_tasks, se.group_node) - _inc_hmp_sched_stats_fair(rq, p, 0); -} - -/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ -void pre_big_task_count_change(const struct cpumask *cpus) -{ - int i; - - local_irq_disable(); - - for_each_cpu(i, cpus) - raw_spin_lock(&cpu_rq(i)->lock); -} - -/* - * Reinitialize 'nr_big_tasks' counters on all affected cpus - */ -void post_big_task_count_change(const struct cpumask *cpus) -{ - int i; - - /* Assumes local_irq_disable() keeps online cpumap stable */ - for_each_cpu(i, cpus) - update_nr_big_tasks(i); - - for_each_cpu(i, cpus) - raw_spin_unlock(&cpu_rq(i)->lock); - - local_irq_enable(); -} - -DEFINE_MUTEX(policy_mutex); - -#ifdef CONFIG_SCHED_FREQ_INPUT -static inline int invalid_value_freq_input(unsigned int *data) -{ - if (data == &sysctl_sched_freq_aggregate) - return !(*data == 0 || *data == 1); - - return 0; -} -#else -static inline int invalid_value_freq_input(unsigned int *data) -{ - return 0; -} -#endif - -static inline int invalid_value(unsigned int *data) -{ - unsigned int val = *data; - - if (data == &sysctl_sched_ravg_hist_size) - return (val < 2 || val > RAVG_HIST_SIZE_MAX); - - if (data == &sysctl_sched_window_stats_policy) - return val >= WINDOW_STATS_INVALID_POLICY; - - return invalid_value_freq_input(data); -} - -/* - * Handle "atomic" update of sysctl_sched_window_stats_policy, - * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. - */ -int sched_window_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - unsigned int *data = (unsigned int *)table->data; - unsigned int old_val; - - if (!sched_enable_hmp) - return -EINVAL; - - mutex_lock(&policy_mutex); - - old_val = *data; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || (write && (old_val == *data))) - goto done; - - if (invalid_value(data)) { - *data = old_val; - ret = -EINVAL; - goto done; - } - - reset_all_window_stats(0, 0); - -done: - mutex_unlock(&policy_mutex); - - return ret; -} - -/* - * Convert percentage value into absolute form. This will avoid div() operation - * in fast path, to convert task load in percentage scale. - */ -int sched_hmp_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - unsigned int old_val; - unsigned int *data = (unsigned int *)table->data; - int update_min_nice = 0; - - mutex_lock(&policy_mutex); - - old_val = *data; - - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write || !sched_enable_hmp) - goto done; - - if (write && (old_val == *data)) - goto done; - - if (data != &sysctl_sched_select_prev_cpu_us) { - /* - * all tunables other than sched_select_prev_cpu_us are - * in percentage. - */ - if (sysctl_sched_downmigrate_pct > - sysctl_sched_upmigrate_pct || *data > 100) { - *data = old_val; - ret = -EINVAL; - goto done; - } - } - - /* - * Big task tunable change will need to re-classify tasks on - * runqueue as big and set their counters appropriately. - * sysctl interface affects secondary variables (*_pct), which is then - * "atomically" carried over to the primary variables. 
Atomic change - * includes taking runqueue lock of all online cpus and re-initiatizing - * their big counter values based on changed criteria. - */ - if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { - get_online_cpus(); - pre_big_task_count_change(cpu_online_mask); - } - - set_hmp_defaults(); - - if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { - post_big_task_count_change(cpu_online_mask); - put_online_cpus(); - } - -done: - mutex_unlock(&policy_mutex); - return ret; -} - -/* * Reset balance_interval at all sched_domain levels of given cpu, so that it * honors kick. */ @@ -4131,8 +3408,6 @@ static inline int migration_needed(struct task_struct *p, int cpu) return 0; } -static DEFINE_RAW_SPINLOCK(migration_lock); - static inline int kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) { @@ -4153,6 +3428,8 @@ kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) return rc; } +static DEFINE_RAW_SPINLOCK(migration_lock); + /* * Check if currently running task should be migrated to a better cpu. * @@ -4183,104 +3460,85 @@ void check_for_migration(struct rq *rq, struct task_struct *p) &rq->active_balance_work); } -static inline int nr_big_tasks(struct rq *rq) -{ - return rq->hmp_stats.nr_big_tasks; -} +#ifdef CONFIG_CFS_BANDWIDTH -unsigned int cpu_temp(int cpu) +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { - struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); - if (per_cpu_info) - return per_cpu_info[cpu].temp; - else - return 0; + cfs_rq->hmp_stats.nr_big_tasks = 0; + cfs_rq->hmp_stats.cumulative_runnable_avg = 0; + cfs_rq->hmp_stats.pred_demands_sum = 0; } -#else /* CONFIG_SCHED_HMP */ - -struct cpu_select_env; -struct sched_cluster; - -static inline int task_will_fit(struct task_struct *p, int cpu, - enum sched_boost_type boost_type) +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { - return 1; + inc_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); } -static inline int select_best_cpu(struct task_struct *p, int target, - int reason, int sync) +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { - return 0; + dec_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); } -unsigned int power_cost(int cpu, u64 demand) +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return SCHED_CAPACITY_SCALE; + stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg += + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum; } -static inline int -spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return 0; -} + stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg -= + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum; -static inline int sched_boost(void) -{ - return 0; + BUG_ON(stats->nr_big_tasks < 0 || + (s64)stats->cumulative_runnable_avg < 0); + verify_pred_demands_sum(stats); } -static inline int is_big_task(struct task_struct *p) -{ - return 0; -} +#else /* CONFIG_CFS_BANDWIDTH */ -static inline int nr_big_tasks(struct rq *rq) -{ - return 0; -} +static inline void 
inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline int is_cpu_throttling_imminent(int cpu) -{ - return 0; -} +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline int is_task_migration_throttled(struct task_struct *p) -{ - return 0; -} +#endif /* CONFIG_CFS_BANDWIDTH */ -unsigned int cpu_temp(int cpu) -{ - return 0; -} +#else /* CONFIG_SCHED_HMP */ -static inline void -inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } -static inline void -dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } -static inline void -inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline void -dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } -static inline int -preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return 1; } -static inline struct sched_cluster *rq_cluster(struct rq *rq) +static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) { - return NULL; } #endif /* CONFIG_SCHED_HMP */ - - #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 #error "load tracking assumes 2^10 as unit" #endif @@ -4323,7 +3581,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; unsigned int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq, scale_cpu; - struct sched_entity *se = NULL; delta = now - sa->last_update_time; /* @@ -4344,12 +3601,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; - if (sched_use_pelt && cfs_rq && weight) { - se = container_of(sa, struct sched_entity, avg); - if (entity_is_task(se) && se->on_rq) - dec_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se)); - } - scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); @@ -4370,7 +3621,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scaled_delta_w = cap_scale(delta_w, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta_w; - add_to_scaled_stat(cpu, sa, delta_w); if (cfs_rq) { cfs_rq->runnable_load_sum += weight * scaled_delta_w; @@ -4397,7 +3647,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, contrib = cap_scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; - add_to_scaled_stat(cpu, sa, contrib); if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } @@ -4409,14 +3658,10 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scaled_delta = cap_scale(delta, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta; - add_to_scaled_stat(cpu, sa, delta); if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } - if (se && entity_is_task(se) && se->on_rq) - inc_hmp_sched_stats_fair(rq_of(cfs_rq), task_of(se)); - if (running) sa->util_sum += scaled_delta * scale_cpu; @@ -4657,191 +3902,13 @@ static inline int idle_balance(struct rq *rq) return 0; } -static inline void -inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } 
-static inline void -dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } - -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_SCHED_HMP - -#ifdef CONFIG_SCHED_FREQ_INPUT -#define clear_ravg_pred_demand() (p->ravg.pred_demand = 0) -#else -#define clear_ravg_pred_demand() -#endif - -void init_new_task_load(struct task_struct *p) -{ - int i; - u32 init_load_windows = sched_init_task_load_windows; - u32 init_load_pelt = sched_init_task_load_pelt; - u32 init_load_pct = current->init_load_pct; - - p->init_load_pct = 0; - rcu_assign_pointer(p->grp, NULL); - INIT_LIST_HEAD(&p->grp_list); - memset(&p->ravg, 0, sizeof(struct ravg)); - p->cpu_cycles = 0; - - if (init_load_pct) { - init_load_pelt = div64_u64((u64)init_load_pct * - (u64)LOAD_AVG_MAX, 100); - init_load_windows = div64_u64((u64)init_load_pct * - (u64)sched_ravg_window, 100); - } - - p->ravg.demand = init_load_windows; - clear_ravg_pred_demand(); - for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) - p->ravg.sum_history[i] = init_load_windows; - p->se.avg.runnable_avg_sum_scaled = init_load_pelt; -} - -#else /* CONFIG_SCHED_HMP */ - -void init_new_task_load(struct task_struct *p) -{ -} - -#endif /* CONFIG_SCHED_HMP */ - -#ifdef CONFIG_SCHED_HMP - -/* Return task demand in percentage scale */ -unsigned int pct_task_load(struct task_struct *p) -{ - unsigned int load; - - load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); - - return load; -} - -/* - * Add scaled version of 'delta' to runnable_avg_sum_scaled - * 'delta' is scaled in reference to "best" cpu - */ -static inline void -add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta) -{ - int cur_freq = cpu_cur_freq(cpu); - u64 scaled_delta; - int sf; - - if (!sched_enable_hmp) - return; - - if (unlikely(cur_freq > max_possible_freq)) - cur_freq = max_possible_freq; - - scaled_delta = div64_u64(delta * cur_freq, max_possible_freq); - sf = (cpu_efficiency(cpu) * 1024) / max_possible_efficiency; - scaled_delta *= sf; - scaled_delta >>= 10; - sa->runnable_avg_sum_scaled += scaled_delta; -} - -static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods) -{ - if (!sched_enable_hmp) - return; - - sa->runnable_avg_sum_scaled = - decay_load(sa->runnable_avg_sum_scaled, - periods); -} - -#ifdef CONFIG_CFS_BANDWIDTH - -static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) -{ - cfs_rq->hmp_stats.nr_big_tasks = 0; - cfs_rq->hmp_stats.cumulative_runnable_avg = 0; - set_pred_demands_sum(&cfs_rq->hmp_stats, 0); -} - -static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) -{ - inc_nr_big_task(&cfs_rq->hmp_stats, p); - if (change_cra) - inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); -} - -static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) -{ - dec_nr_big_task(&cfs_rq->hmp_stats, p); - if (change_cra) - dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); -} - -static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ - stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; - stats->cumulative_runnable_avg += - cfs_rq->hmp_stats.cumulative_runnable_avg; - set_pred_demands_sum(stats, stats->pred_demands_sum + - cfs_rq->hmp_stats.pred_demands_sum); -} - -static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ - stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; - stats->cumulative_runnable_avg -= - cfs_rq->hmp_stats.cumulative_runnable_avg; - set_pred_demands_sum(stats, 
stats->pred_demands_sum - - cfs_rq->hmp_stats.pred_demands_sum); - - BUG_ON(stats->nr_big_tasks < 0 || - (s64)stats->cumulative_runnable_avg < 0); - verify_pred_demands_sum(stats); -} - -#else /* CONFIG_CFS_BANDWIDTH */ - -static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) { } - -static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, - struct task_struct *p, int change_cra) { } - -#endif /* CONFIG_CFS_BANDWIDTH */ - -#else /* CONFIG_SCHED_HMP */ - -static inline void -add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta) -{ -} - -static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods) -{ -} - -static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } - static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, struct task_struct *p, int change_cra) { } static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, struct task_struct *p, int change_cra) { } -static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ -} - -static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, - struct cfs_rq *cfs_rq) -{ -} - -#endif /* CONFIG_SCHED_HMP */ +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -7579,9 +6646,6 @@ struct lb_env { struct list_head tasks; }; -static DEFINE_PER_CPU(bool, dbs_boost_needed); -static DEFINE_PER_CPU(int, dbs_boost_load_moved); - /* * Is this task likely cache-hot: */ @@ -7792,7 +6856,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); - if (rcu_access_pointer(p->grp)) + if (task_in_related_thread_group(p)) env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; double_unlock_balance(env->src_rq, env->dst_rq); } @@ -7822,7 +6886,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) * inside detach_tasks(). 
*/ schedstat_inc(env->sd, lb_gained[env->idle]); - per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p); return p; } @@ -7895,7 +6958,6 @@ redo: detached++; env->imbalance -= load; - per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p); #ifdef CONFIG_PREEMPT /* @@ -7949,8 +7011,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); - if (task_notify_on_migrate(p)) - per_cpu(dbs_boost_needed, task_cpu(p)) = true; } /* @@ -9241,7 +8301,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - per_cpu(dbs_boost_load_moved, this_cpu) = 0; schedstat_inc(sd, lb_count[idle]); redo: @@ -9436,20 +8495,6 @@ no_move: } } else { sd->nr_balance_failed = 0; - if (per_cpu(dbs_boost_needed, this_cpu)) { - struct migration_notify_data mnd; - - mnd.src_cpu = cpu_of(busiest); - mnd.dest_cpu = this_cpu; - mnd.load = per_cpu(dbs_boost_load_moved, this_cpu); - if (mnd.load > 100) - mnd.load = 100; - atomic_notifier_call_chain(&migration_notifier_head, - 0, (void *)&mnd); - per_cpu(dbs_boost_needed, this_cpu) = false; - per_cpu(dbs_boost_load_moved, this_cpu) = 0; - - } /* Assumes one 'busiest' cpu that we pulled tasks from */ if (!same_freq_domain(this_cpu, cpu_of(busiest))) { @@ -9681,8 +8726,6 @@ static int active_load_balance_cpu_stop(void *data) raw_spin_lock_irq(&busiest_rq->lock); - per_cpu(dbs_boost_load_moved, target_cpu) = 0; - /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) @@ -9765,20 +8808,6 @@ out_unlock: check_for_freq_change(target_rq, true, false); } - if (per_cpu(dbs_boost_needed, target_cpu)) { - struct migration_notify_data mnd; - - mnd.src_cpu = cpu_of(busiest_rq); - mnd.dest_cpu = target_cpu; - mnd.load = per_cpu(dbs_boost_load_moved, target_cpu); - if (mnd.load > 100) - mnd.load = 100; - atomic_notifier_call_chain(&migration_notifier_head, - 0, (void *)&mnd); - - per_cpu(dbs_boost_needed, target_cpu) = false; - per_cpu(dbs_boost_load_moved, target_cpu) = 0; - } return 0; } diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c new file mode 100644 index 000000000000..8da0147b4f89 --- /dev/null +++ b/kernel/sched/hmp.c @@ -0,0 +1,3970 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Implementation credits: Srivatsa Vaddagiri, Steve Muckle + * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti + * and Vikram Mulukutla + */ + +#include <linux/cpufreq.h> +#include <linux/list_sort.h> +#include <linux/syscore_ops.h> + +#include "sched.h" + +#include <trace/events/sched.h> + +static ktime_t ktime_last; +static bool sched_ktime_suspended; + +static bool use_cycle_counter; +static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +inline void clear_ed_task(struct task_struct *p, struct rq *rq) +{ + if (p == rq->ed_task) + rq->ed_task = NULL; +} + +inline void set_task_last_wake(struct task_struct *p, u64 wallclock) +{ + p->last_wake_ts = wallclock; +} + +inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) +{ + p->last_switch_out_ts = wallclock; +} + +/* + * Note C-state for (idle) cpus. + * + * @cstate = cstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cpu + * @wakeup_latency = latency to wakeup from cstate + * + */ +void +sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) +{ + struct rq *rq = cpu_rq(cpu); + + rq->cstate = cstate; /* C1, C2 etc */ + rq->wakeup_energy = wakeup_energy; + rq->wakeup_latency = wakeup_latency; +} + +/* + * Note D-state for (idle) cluster. 
+ * + * @dstate = dstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cluster + * @wakeup_latency = latency to wakeup from cluster + * + */ +void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, + int wakeup_energy, int wakeup_latency) +{ + struct sched_cluster *cluster = + cpu_rq(cpumask_first(cluster_cpus))->cluster; + cluster->dstate = dstate; + cluster->dstate_wakeup_energy = wakeup_energy; + cluster->dstate_wakeup_latency = wakeup_latency; +} + +u32 __weak get_freq_max_load(int cpu, u32 freq) +{ + /* 100% by default */ + return 100; +} + +DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); +static DEFINE_SPINLOCK(freq_max_load_lock); + +struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) +{ + return NULL; +} + +int sched_update_freq_max_load(const cpumask_t *cpumask) +{ + int i, cpu, ret; + unsigned int freq; + struct cpu_pstate_pwr *costs; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct freq_max_load *max_load, *old_max_load; + struct freq_max_load_entry *entry; + u64 max_demand_capacity, max_demand; + unsigned long flags; + u32 hfreq; + int hpct; + + if (!per_cpu_info) + return 0; + + spin_lock_irqsave(&freq_max_load_lock, flags); + max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); + for_each_cpu(cpu, cpumask) { + if (!per_cpu_info[cpu].ptable) { + ret = -EINVAL; + goto fail; + } + + old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + + /* + * Allocate len + 1 entries and leave the last power cost as 0 so that + * power_cost() can stop iterating when per_cpu_info[cpu].len exceeds + * max_load->length due to a race between a cpu power stats update and + * get_cpu_pwr_stats(). + */ + max_load = kzalloc(sizeof(struct freq_max_load) + + sizeof(struct freq_max_load_entry) * + (per_cpu_info[cpu].len + 1), GFP_ATOMIC); + if (unlikely(!max_load)) { + ret = -ENOMEM; + goto fail; + } + + max_load->length = per_cpu_info[cpu].len; + + max_demand = max_demand_capacity * + cpu_max_possible_capacity(cpu); + + i = 0; + costs = per_cpu_info[cpu].ptable; + while (costs[i].freq) { + entry = &max_load->freqs[i]; + freq = costs[i].freq; + hpct = get_freq_max_load(cpu, freq); + if (hpct <= 0 || hpct > 100) + hpct = 100; + hfreq = div64_u64((u64)freq * hpct, 100); + entry->hdemand = + div64_u64(max_demand * hfreq, + cpu_max_possible_freq(cpu)); + i++; + } + + rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); + if (old_max_load) + kfree_rcu(old_max_load, rcu); + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return 0; + +fail: + for_each_cpu(cpu, cpumask) { + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (max_load) { + rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); + kfree_rcu(max_load, rcu); + } + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return ret; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_online_cpu(i) { + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + +
__update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. 
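A rough worked example of the load_scale_factor computation described above, using assumed numbers that are not taken from this patch: consider a cluster whose efficiency is half of max_possible_efficiency and whose max_freq is half of max_possible_freq.
 load_scale_cpu_efficiency() = DIV_ROUND_UP(1024 * max_possible_efficiency, max_possible_efficiency / 2) = 2048
 load_scale_cpu_freq()       = DIV_ROUND_UP(1024 * max_possible_freq, max_possible_freq / 2) = 2048
 compute_load_scale_factor() = (((1024 * 2048) >> 10) * 2048) >> 10 = 4096
So task load observed on such a cluster is scaled up roughly 4x before it is compared against thresholds expressed relative to the best-performing cpu.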
+ */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +struct list_head cluster_head; +static DEFINE_MUTEX(cluster_lock); +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .max_mitigated_freq = UINT_MAX, + .min_freq = 1, + .max_possible_freq = 1, + .dstate = 0, + .dstate_wakeup_energy = 0, + .dstate_wakeup_latency = 0, + .exec_scale_factor = 1024, + .notifier_sent = 0, +}; + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + + pre_big_task_count_change(cpu_possible_mask); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + sched_update_freq_max_load(cpu_possible_mask); + post_big_task_count_change(cpu_possible_mask); +} + +static void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +static void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +static int +compare_clusters(void *priv, struct list_head *a, struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +static void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + } + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + __WARN_printf("Cluster allocation failed. \ + Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->max_mitigated_freq = UINT_MAX; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->dstate = 0; + cluster->dstate_wakeup_energy = 0; + cluster->dstate_wakeup_latency = 0; + cluster->freq_init_done = false; + + cluster->cpus = *cpus; + cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + cluster->notifier_sent = 0; + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + INIT_LIST_HEAD(&cluster_head); +} + +int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + mutex_lock(&cluster_lock); + if (!cb->get_cpu_cycle_counter) { + mutex_unlock(&cluster_lock); + return -EINVAL; + } + + cpu_cycle_counter_cb = *cb; + use_cycle_counter = true; + mutex_unlock(&cluster_lock); + + return 0; +} + +int __init set_sched_enable_hmp(char *str) +{ + int enable_hmp = 0; + + get_option(&str, &enable_hmp); + + sched_enable_hmp = !!enable_hmp; + + return 0; +} + +early_param("sched_enable_hmp", set_sched_enable_hmp); + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +inline void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +/* Clear any HMP scheduler related requests pending from or on cpu */ +void clear_hmp_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + clear_boost_kick(cpu); + clear_reserved(cpu); + if (rq->push_task) { + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->push_task) { + clear_reserved(rq->push_cpu); + put_task_struct(rq->push_task); + rq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + +int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) +{ + struct rq *rq = cpu_rq(cpu); + + rq->static_cpu_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cpu_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->static_cpu_pwr_cost; +} + +int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->static_cluster_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cluster_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; +} + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +/* + * Tasks that are runnable continuously for a period greather than + * EARLY_DETECTION_DURATION can be flagged early as potential + * high load tasks. + */ +#define EARLY_DETECTION_DURATION 9500000 + +static __read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +static __read_mostly unsigned int sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; + +#define SCHED_ACCOUNT_WAIT_TIME 1 + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +unsigned int __read_mostly sysctl_sched_enable_colocation = 1; + +/* + * Enable colocation and frequency aggregation for all threads in a process. + * The children inherits the group id from the parent. 
+ */ +unsigned int __read_mostly sysctl_sched_enable_thread_grouping; + + +__read_mostly unsigned int sysctl_sched_new_task_windows = 5; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 + +/* + * For increase, send notification if + * freq_required - cur_freq > sysctl_sched_freq_inc_notify + */ +__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ + +/* + * For decrease, send notification if + * cur_freq - freq_required > sysctl_sched_freq_dec_notify + */ +__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ + +static __read_mostly unsigned int sched_io_is_busy; + +__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = 10000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +/* + * Major task runtime. If a task runs for more than sched_major_task_runtime + * in a window, it's considered to be generating majority of workload + * for this window. Prediction could be adjusted for such tasks. + */ +__read_mostly unsigned int sched_major_task_runtime = 10000000; + +static unsigned int sync_cpu; + +static LIST_HEAD(related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &related_thread_groups, list) + +/* + * Demand aggregation for frequency purpose: + * + * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads + * for frequency determination purpose. This aggregation is done per-cluster. + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just rq->prev_runnable_sum. + * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. 
+ * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +static __read_mostly unsigned int sched_freq_aggregate; +__read_mostly unsigned int sysctl_sched_freq_aggregate; + +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +/* Use this knob to turn on or off HMP-aware task placement logic */ +unsigned int __read_mostly sched_enable_hmp; + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ +unsigned int sysctl_sched_boost; + +/* A cpu can no longer accommodate more tasks if: + * + * rq->nr_running > sysctl_sched_spill_nr_run || + * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load + */ +unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; + +/* + * Place sync wakee tasks those have less than configured demand to the waker's + * cluster. + */ +unsigned int __read_mostly sched_small_wakee_task_load; +unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; + +unsigned int __read_mostly sched_big_waker_task_load; +unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; + +/* + * CPUs with load greater than the sched_spill_load_threshold are not + * eligible for task placement. When all CPUs in a cluster achieve a + * load higher than this level, tasks becomes eligible for inter + * cluster migration. + */ +unsigned int __read_mostly sched_spill_load; +unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; + +/* + * Tasks whose bandwidth consumption on a cpu is more than + * sched_upmigrate are considered "big" tasks. Big tasks will be + * considered for "up" migration, i.e migrating to a cpu with better + * capacity. + */ +unsigned int __read_mostly sched_upmigrate; +unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; + +/* + * Big tasks, once migrated, will need to drop their bandwidth + * consumption to less than sched_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_downmigrate; +unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; + +/* + * The load scale factor of a CPU gets boosted when its max frequency + * is restricted due to which the tasks are migrating to higher capacity + * CPUs early. The sched_upmigrate threshold is auto-upgraded by + * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. 
+ */ +unsigned int up_down_migrate_scale_factor = 1024; + +/* + * Scheduler selects and places task to its previous CPU if sleep time is + * less than sysctl_sched_select_prev_cpu_us. + */ +unsigned int __read_mostly +sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; + +unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; + +unsigned int __read_mostly +sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; + +unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; + +void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + unsigned int delta; + + if (up_down_migrate_scale_factor == 1024) + goto done; + + delta = up_migrate - down_migrate; + + up_migrate /= NSEC_PER_USEC; + up_migrate *= up_down_migrate_scale_factor; + up_migrate >>= 10; + up_migrate *= NSEC_PER_USEC; + + up_migrate = min(up_migrate, sched_ravg_window); + + down_migrate /= NSEC_PER_USEC; + down_migrate *= up_down_migrate_scale_factor; + down_migrate >>= 10; + down_migrate *= NSEC_PER_USEC; + + down_migrate = min(down_migrate, up_migrate - delta); +done: + sched_upmigrate = up_migrate; + sched_downmigrate = down_migrate; +} + +void set_hmp_defaults(void) +{ + sched_spill_load = + pct_to_real(sysctl_sched_spill_load_pct); + + update_up_down_migrate(); + + sched_major_task_runtime = + mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * + NSEC_PER_USEC; + + sched_small_wakee_task_load = + div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_big_waker_task_load = + div64_u64((u64)sysctl_sched_big_waker_task_load_pct * + (u64)sched_ravg_window, 100); +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED + +int upmigrate_discouraged(struct task_struct *p) +{ + return task_group(p)->upmigrate_discouraged; +} + +#else + +static inline int upmigrate_discouraged(struct task_struct *p) +{ + return 0; +} + +#endif + +/* Is a task "big" on its current cpu */ +static inline int __is_big_task(struct task_struct *p, u64 scaled_load) +{ + int nice = task_nice(p); + + if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) + return 0; + + return scaled_load > sched_upmigrate; +} + +int is_big_task(struct task_struct *p) +{ + return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); +} + +u64 cpu_load(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); +} + +u64 cpu_load_sync(int cpu, int sync) +{ + return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); +} + +static int boost_refcount; +static DEFINE_SPINLOCK(boost_lock); +static DEFINE_MUTEX(boost_mutex); + +static void boost_kick_cpus(void) +{ + int i; + + for_each_online_cpu(i) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int sched_boost(void) +{ + return boost_refcount > 0; +} + +int sched_set_boost(int enable) +{ + unsigned long flags; + int ret = 0; + int old_refcount; + + if (!sched_enable_hmp) + return -EINVAL; + + 
spin_lock_irqsave(&boost_lock, flags); + + old_refcount = boost_refcount; + + if (enable == 1) { + boost_refcount++; + } else if (!enable) { + if (boost_refcount >= 1) + boost_refcount--; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (!old_refcount && boost_refcount) + boost_kick_cpus(); + + trace_sched_set_boost(boost_refcount); + spin_unlock_irqrestore(&boost_lock, flags); + + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&boost_mutex); + if (!write) + sysctl_sched_boost = sched_boost(); + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write) + goto done; + + ret = (sysctl_sched_boost <= 1) ? + sched_set_boost(sysctl_sched_boost) : -EINVAL; + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +/* + * Task will fit on a cpu if it's bandwidth consumption on that cpu + * will be less than sched_upmigrate. A big task that was previously + * "up" migrated will be considered fitting on "little" cpu if its + * bandwidth consumption on "little" cpu will be less than + * sched_downmigrate. This will help avoid frequenty migrations for + * tasks with load close to the upmigrate threshold + */ +int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_type boost_type) +{ + int upmigrate; + + if (cpu_capacity(cpu) == max_capacity) + return 1; + + if (boost_type != SCHED_BOOST_ON_BIG) { + if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) + return 1; + + upmigrate = sched_upmigrate; + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (task_load < upmigrate) + return 1; + } + + return 0; +} + +enum sched_boost_type sched_boost_type(void) +{ + if (sched_boost()) { + if (min_possible_efficiency != max_possible_efficiency) + return SCHED_BOOST_ON_BIG; + else + return SCHED_BOOST_ON_ALL; + } + return SCHED_BOOST_NONE; +} + +int task_will_fit(struct task_struct *p, int cpu) +{ + u64 tload = scale_load_to_cpu(task_load(p), cpu); + + return task_load_will_fit(p, tload, cpu, sched_boost_type()); +} + +int group_will_fit(struct sched_cluster *cluster, + struct related_thread_group *grp, u64 demand) +{ + int cpu = cluster_first_cpu(cluster); + int prev_capacity = 0; + unsigned int threshold = sched_upmigrate; + u64 load; + + if (cluster->capacity == max_capacity) + return 1; + + if (grp->preferred_cluster) + prev_capacity = grp->preferred_cluster->capacity; + + if (cluster->capacity < prev_capacity) + threshold = sched_downmigrate; + + load = scale_load_to_cpu(demand, cpu); + if (load < threshold) + return 1; + + return 0; +} + +/* + * Return the cost of running task p on CPU cpu. This function + * currently assumes that task p is the only task which will run on + * the CPU. + */ +unsigned int power_cost(int cpu, u64 demand) +{ + int first, mid, last; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct cpu_pstate_pwr *costs; + struct freq_max_load *max_load; + int total_static_pwr_cost = 0; + struct rq *rq = cpu_rq(cpu); + unsigned int pc; + + if (!per_cpu_info || !per_cpu_info[cpu].ptable) + /* + * When power aware scheduling is not in use, or CPU + * power data is not available, just use the CPU + * capacity as a rough stand-in for real CPU power + * numbers, assuming bigger CPUs are more power + * hungry. 
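The frequency-table lookup a little further below can be illustrated with a small assumed table; none of these numbers come from this patch:
 freqs[].hdemand = { 2ms, 5ms, 10ms }, costs[].power = { 100, 250, 600 }
 demand = 4ms  -> the binary search ends with last == 1, so pc = costs[1].power = 250
 demand = 12ms -> beyond the last hdemand entry, so pc = costs[2].power = 600
If the cpu is idle in a c-state, rq->static_cpu_pwr_cost is added on top (plus the cluster's static cost when the cluster is also in a d-state) before pc is returned.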
+ */ + return cpu_max_possible_capacity(cpu); + + rcu_read_lock(); + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (!max_load) { + pc = cpu_max_possible_capacity(cpu); + goto unlock; + } + + costs = per_cpu_info[cpu].ptable; + + if (demand <= max_load->freqs[0].hdemand) { + pc = costs[0].power; + goto unlock; + } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { + pc = costs[max_load->length - 1].power; + goto unlock; + } + + first = 0; + last = max_load->length - 1; + mid = (last - first) >> 1; + while (1) { + if (demand <= max_load->freqs[mid].hdemand) + last = mid; + else + first = mid; + + if (last - first == 1) + break; + mid = first + ((last - first) >> 1); + } + + pc = costs[last].power; + +unlock: + rcu_read_unlock(); + + if (idle_cpu(cpu) && rq->cstate) { + total_static_pwr_cost += rq->static_cpu_pwr_cost; + if (rq->cluster->dstate) + total_static_pwr_cost += + rq->cluster->static_cluster_pwr_cost; + } + + return pc + total_static_pwr_cost; + +} + +void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks++; +} + +void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks--; + + BUG_ON(stats->nr_big_tasks < 0); +} + +void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) +{ + stats->nr_big_tasks = 0; + if (reset_cra) { + stats->cumulative_runnable_avg = 0; + stats->pred_demands_sum = 0; + } +} + +/* + * Invoked from three places: + * 1) try_to_wake_up() -> ... -> select_best_cpu() + * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu() + * 3) can_migrate_task() + * + * Its safe to de-reference p->grp in first case (since p->pi_lock is held) + * but not in other cases. 
p->grp is hence freed after a RCU grace period and + * accessed under rcu_read_lock() + */ +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp; + int rc = 0; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (!grp || !sysctl_sched_enable_colocation) + rc = 1; + else + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +struct sched_cluster *rq_cluster(struct rq *rq) +{ + return rq->cluster; +} + +/* + * reset_cpu_hmp_stats - reset HMP stats for a cpu + * nr_big_tasks + * cumulative_runnable_avg (iff reset_cra is true) + */ +void reset_cpu_hmp_stats(int cpu, int reset_cra) +{ + reset_cfs_rq_hmp_stats(cpu, reset_cra); + reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); +} + +void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta) +{ + u64 new_task_load; + u64 old_task_load; + + if (!sched_enable_hmp || sched_disable_window_stats) + return; + + old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); + new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); + + if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) + stats->nr_big_tasks--; + else if (!__is_big_task(p, old_task_load) && + __is_big_task(p, new_task_load)) + stats->nr_big_tasks++; + + BUG_ON(stats->nr_big_tasks < 0); +} + +/* + * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. + */ +static void update_nr_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + /* Do not reset cumulative_runnable_avg */ + reset_cpu_hmp_stats(cpu, 0); + + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) + _inc_hmp_sched_stats_fair(rq, p, 0); +} + +/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ +void pre_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + local_irq_disable(); + + for_each_cpu(i, cpus) + raw_spin_lock(&cpu_rq(i)->lock); +} + +/* + * Reinitialize 'nr_big_tasks' counters on all affected cpus + */ +void post_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + /* Assumes local_irq_disable() keeps online cpumap stable */ + for_each_cpu(i, cpus) + update_nr_big_tasks(i); + + for_each_cpu(i, cpus) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_enable(); +} + +DEFINE_MUTEX(policy_mutex); + +static inline int invalid_value_freq_input(unsigned int *data) +{ + if (data == &sysctl_sched_freq_aggregate) + return !(*data == 0 || *data == 1); + + return 0; +} + +static inline int invalid_value(unsigned int *data) +{ + unsigned int val = *data; + + if (data == &sysctl_sched_ravg_hist_size) + return (val < 2 || val > RAVG_HIST_SIZE_MAX); + + if (data == &sysctl_sched_window_stats_policy) + return val >= WINDOW_STATS_INVALID_POLICY; + + return invalid_value_freq_input(data); +} + +/* + * Handle "atomic" update of sysctl_sched_window_stats_policy, + * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. 
+ */ +int sched_window_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + if (!sched_enable_hmp) + return -EINVAL; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write || (write && (old_val == *data))) + goto done; + + if (invalid_value(data)) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + reset_all_window_stats(0, 0); + +done: + mutex_unlock(&policy_mutex); + + return ret; +} + +/* + * Convert percentage value into absolute form. This will avoid div() operation + * in fast path, to convert task load in percentage scale. + */ +int sched_hmp_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int old_val; + unsigned int *data = (unsigned int *)table->data; + int update_min_nice = 0; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write || !sched_enable_hmp) + goto done; + + if (write && (old_val == *data)) + goto done; + + if (data != &sysctl_sched_select_prev_cpu_us) { + /* + * all tunables other than sched_select_prev_cpu_us are + * in percentage. + */ + if (sysctl_sched_downmigrate_pct > + sysctl_sched_upmigrate_pct || *data > 100) { + *data = old_val; + ret = -EINVAL; + goto done; + } + } + + /* + * Big task tunable change will need to re-classify tasks on + * runqueue as big and set their counters appropriately. + * sysctl interface affects secondary variables (*_pct), which is then + * "atomically" carried over to the primary variables. Atomic change + * includes taking runqueue lock of all online cpus and re-initiatizing + * their big counter values based on changed criteria. 
+ */ + if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + } + + set_hmp_defaults(); + + if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + } + +done: + mutex_unlock(&policy_mutex); + return ret; +} + +inline int nr_big_tasks(struct rq *rq) +{ + return rq->hmp_stats.nr_big_tasks; +} + +unsigned int cpu_temp(int cpu) +{ + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + + if (per_cpu_info) + return per_cpu_info[cpu].temp; + else + return 0; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); + memset(&p->ravg, 0, sizeof(struct ravg)); + p->cpu_cycles = 0; + + if (init_load_pct) + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + + p->ravg.demand = init_load_windows; + p->ravg.pred_demand = 0; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +/* Return task demand in percentage scale */ +unsigned int pct_task_load(struct task_struct *p) +{ + unsigned int load; + + load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); + + return load; +} + +/* + * Return total number of tasks "eligible" to run on highest capacity cpu + * + * This is simply nr_big_tasks for cpus which are not of max_capacity and + * nr_running for cpus of max_capacity + */ +unsigned int nr_eligible_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + int nr_big = rq->hmp_stats.nr_big_tasks; + int nr = rq->nr_running; + + if (cpu_max_possible_capacity(cpu) != max_possible_capacity) + return nr_big; + + return nr; +} + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} + +early_param("sched_ravg_window", set_sched_ravg_window); + +static inline void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + u32 freq; + + freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); + delta *= rq->cluster->exec_scale_factor; + delta >>= 10; + + return delta; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +/* Does freq_required sufficiently exceed or fall behind cur_freq? 
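 *
 * Illustrative numbers: with cur_freq = 1000000, freq_required = 1150000
 * (kHz) and sysctl_sched_freq_inc_notify = 200000, the required increase of
 * 150000 is below the notify threshold, so the two are treated as nearly the
 * same and no alert is raised, while a required increase of 300000 would not
 * be filtered out.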
*/ +static inline int +nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) +{ + int delta = freq_required - cur_freq; + + if (freq_required > cur_freq) + return delta < sysctl_sched_freq_inc_notify; + + delta = -delta; + + return delta < sysctl_sched_freq_dec_notify; +} + +/* Convert busy time to frequency equivalent */ +static inline unsigned int load_to_freq(struct rq *rq, u64 load) +{ + unsigned int freq; + + load = scale_load_to_cpu(load, cpu_of(rq)); + load *= 128; + load = div64_u64(load, max_task_load()); + + freq = load * cpu_max_possible_freq(cpu_of(rq)); + freq /= 128; + + return freq; +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu); + +/* + * Return load from all related group in given cpu. + * Caller must ensure that related_thread_group_lock is held. + */ +static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + + for_each_related_thread_group(grp) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, cpu); + *grp_load += cpu_time->prev_runnable_sum; + if (new_grp_load) + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } +} + +/* + * Return load from all related groups in given frequency domain. + * Caller must ensure that related_thread_group_lock is held. + */ +static void group_load_in_freq_domain(struct cpumask *cpus, + u64 *grp_load, u64 *new_grp_load) +{ + struct related_thread_group *grp; + int j; + + for_each_related_thread_group(grp) { + for_each_cpu(j, cpus) { + struct group_cpu_time *cpu_time; + + cpu_time = _group_cpu_time(grp, j); + *grp_load += cpu_time->prev_runnable_sum; + *new_grp_load += cpu_time->nt_prev_runnable_sum; + } + } +} + +/* + * Should scheduler alert governor for changing frequency? + * + * @check_pred - evaluate frequency based on the predictive demand + * @check_groups - add load from all related groups on given cpu + * + * check_groups is set to 1 if a "related" task movement/wakeup is triggering + * the notification check. 
To avoid "re-aggregation" of demand in such cases, + * we check whether the migrated/woken tasks demand (along with demand from + * existing tasks on the cpu) can be met on target cpu + * + */ + +static int send_notification(struct rq *rq, int check_pred, int check_groups) +{ + unsigned int cur_freq, freq_required; + unsigned long flags; + int rc = 0; + u64 group_load = 0, new_load = 0; + + if (!sched_enable_hmp) + return 0; + + if (check_pred) { + u64 prev = rq->old_busy_time; + u64 predicted = rq->hmp_stats.pred_demands_sum; + + if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) + return 0; + + prev = max(prev, rq->old_estimated_time); + if (prev > predicted) + return 0; + + cur_freq = load_to_freq(rq, prev); + freq_required = load_to_freq(rq, predicted); + + if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) + return 0; + } else { + read_lock(&related_thread_group_lock); + /* + * Protect from concurrent update of rq->prev_runnable_sum and + * group cpu load + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (check_groups) + _group_load_in_cpu(cpu_of(rq), &group_load, NULL); + + new_load = rq->prev_runnable_sum + group_load; + + raw_spin_unlock_irqrestore(&rq->lock, flags); + read_unlock(&related_thread_group_lock); + + cur_freq = load_to_freq(rq, rq->old_busy_time); + freq_required = load_to_freq(rq, new_load); + + if (nearly_same_freq(cur_freq, freq_required)) + return 0; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->cluster->notifier_sent) { + rq->cluster->notifier_sent = 1; + rc = 1; + trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, + new_load); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +/* Alert governor if there is a need to change frequency */ +void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) +{ + int cpu = cpu_of(rq); + + if (!send_notification(rq, check_pred, check_groups)) + return; + + atomic_notifier_call_chain( + &load_alert_notifier_head, 0, + (void *)(long)cpu); +} + +void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, + struct task_struct *p) +{ + bool check_groups; + + rcu_read_lock(); + check_groups = task_in_related_thread_group(p); + rcu_read_unlock(); + + if (!same_freq_domain(src_cpu, dest_cpu)) { + if (!src_cpu_dead) + check_for_freq_change(cpu_rq(src_cpu), false, + check_groups); + check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); + } else { + check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); + } +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < sysctl_sched_new_task_windows; +} + +#define INC_STEP 8 +#define DEC_STEP 2 +#define CONSISTENT_THRES 16 +#define INC_STEP_BIG 16 +/* + * bucket_increase - update the count of all buckets + * + * @buckets: array of buckets tracking busy time of a task + * @idx: the index of bucket to be incremented + * + * Each time a complete window finishes, count of bucket that runtime + * falls in (@idx) is incremented. Counts of all other buckets are + * decayed. The rate of increase and decay could be different based + * on current count in the bucket. + */ +static inline void bucket_increase(u8 *buckets, int idx) +{ + int i, step; + + for (i = 0; i < NUM_BUSY_BUCKETS; i++) { + if (idx != i) { + if (buckets[i] > DEC_STEP) + buckets[i] -= DEC_STEP; + else + buckets[i] = 0; + } else { + step = buckets[i] >= CONSISTENT_THRES ? + INC_STEP_BIG : INC_STEP; + if (buckets[i] > U8_MAX - step) + buckets[i] = U8_MAX; + else + buckets[i] += step; + } + } +} + +static inline int busy_to_bucket(u32 normalized_rt) +{ + int bidx; + + bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); + bidx = min(bidx, NUM_BUSY_BUCKETS - 1); + + /* + * Combine lowest two buckets. The lowest frequency falls into + * 2nd bucket and thus keep predicting lowest bucket is not + * useful. + */ + if (!bidx) + bidx++; + + return bidx; +} + +static inline u64 +scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) +{ + return div64_u64(load * (u64)src_freq, (u64)dst_freq); +} + +#define HEAVY_TASK_SKIP 2 +#define HEAVY_TASK_SKIP_LIMIT 4 +/* + * get_pred_busy - calculate predicted demand for a task on runqueue + * + * @rq: runqueue of task p + * @p: task whose prediction is being updated + * @start: starting bucket. returned prediction should not be lower than + * this bucket. + * @runtime: runtime of the task. returned prediction should not be lower + * than this runtime. + * Note: @start can be derived from @runtime. It's passed in only to + * avoid duplicated calculation in some cases. + * + * A new predicted busy time is returned for task @p based on @runtime + * passed in. The function searches through buckets that represent busy + * time equal to or bigger than @runtime and attempts to find the bucket to + * to use for prediction. Once found, it searches through historical busy + * time and returns the latest that falls into the bucket. If no such busy + * time exists, it returns the medium of that bucket. 
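 *
 * Worked example (assuming NUM_BUSY_BUCKETS = 10 for illustration): a runtime
 * of 34% of max_task_load() starts the search at bucket 3. If the lowest
 * filled bucket at or above that is bucket 5, the predicted demand range is
 * [50%, 60%) of max_task_load(); the most recent sum_history[] sample inside
 * that range is returned, or the bucket midpoint (55%) when no such sample
 * exists. The result is never allowed to drop below @runtime.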
+ */ +static u32 get_pred_busy(struct rq *rq, struct task_struct *p, + int start, u32 runtime) +{ + int i; + u8 *buckets = p->ravg.busy_buckets; + u32 *hist = p->ravg.sum_history; + u32 dmin, dmax; + u64 cur_freq_runtime = 0; + int first = NUM_BUSY_BUCKETS, final, skip_to; + u32 ret = runtime; + + /* skip prediction for new tasks due to lack of history */ + if (unlikely(is_new_task(p))) + goto out; + + /* find minimal bucket index to pick */ + for (i = start; i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i]) { + first = i; + break; + } + } + /* if no higher buckets are filled, predict runtime */ + if (first >= NUM_BUSY_BUCKETS) + goto out; + + /* compute the bucket for prediction */ + final = first; + if (first < HEAVY_TASK_SKIP_LIMIT) { + /* compute runtime at current CPU frequency */ + cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, + rq->cluster->efficiency); + cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, + max_possible_freq, rq->cluster->cur_freq); + /* + * if the task runs for majority of the window, try to + * pick higher buckets. + */ + if (cur_freq_runtime >= sched_major_task_runtime) { + int next = NUM_BUSY_BUCKETS; + /* + * if there is a higher bucket that's consistently + * hit, don't jump beyond that. + */ + for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && + i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i] > CONSISTENT_THRES) { + next = i; + break; + } + } + skip_to = min(next, start + HEAVY_TASK_SKIP); + /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ + skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); + /* don't go below first non-empty bucket, if any */ + final = max(first, skip_to); + } + } + + /* determine demand range for the predicted bucket */ + if (final < 2) { + /* lowest two buckets are combined */ + dmin = 0; + final = 1; + } else { + dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); + } + dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); + + /* + * search through runtime history and return first runtime that falls + * into the range of predicted bucket. + */ + for (i = 0; i < sched_ravg_hist_size; i++) { + if (hist[i] >= dmin && hist[i] < dmax) { + ret = hist[i]; + break; + } + } + /* no historical runtime within bucket found, use average of the bin */ + if (ret < dmin) + ret = (dmin + dmax) / 2; + /* + * when updating in middle of a window, runtime could be higher + * than all recorded history. Always predict at least runtime. + */ + ret = max(runtime, ret); +out: + trace_sched_update_pred_demand(rq, p, runtime, + mult_frac((unsigned int)cur_freq_runtime, 100, + sched_ravg_window), ret); + return ret; +} + +static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) +{ + if (p->ravg.pred_demand >= p->ravg.curr_window) + return p->ravg.pred_demand; + + return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), + p->ravg.curr_window); +} + +/* + * predictive demand of a task is calculated at the window roll-over. + * if the task current window busy time exceeds the predicted + * demand, update it here to reflect the task needs. 
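 *
 * For instance, if pred_demand corresponds to 4 ms of busy time but the task
 * has already accumulated 6 ms in the still-open window, the prediction is
 * raised via get_pred_busy() right away rather than at the next roll-over,
 * so the predicted load reported to the governor does not lag a burst.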
+ */ +void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ + u32 new, old; + + if (is_idle_task(p) || exiting_task(p)) + return; + + if (event != PUT_PREV_TASK && event != TASK_UPDATE && + (!SCHED_FREQ_ACCOUNT_WAIT_TIME || + (event != TASK_MIGRATE && + event != PICK_NEXT_TASK))) + return; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) + return; + } + + new = calc_pred_demand(rq, p); + old = p->ravg.pred_demand; + + if (old >= new) + return; + + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, + p->ravg.demand, + new); + + p->ravg.pred_demand = new; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + int flip_counters = 0; + int prev_sum_reset = 0; + bool new_task; + struct related_thread_group *grp; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + grp = p->grp; + if (grp && sched_freq_aggregate) { + /* cpu_time protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu_of(rq)); + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (cpu_time->window_start != rq->window_start) { + int nr_windows; + + delta = rq->window_start - cpu_time->window_start; + nr_windows = div64_u64(delta, window_size); + if (nr_windows > 1) + prev_sum_reset = 1; + + cpu_time->window_start = rq->window_start; + flip_counters = 1; + } + + if (p_is_curr_task && new_window) { + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (full_window) + curr_sum = nt_curr_sum = 0; + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + } + } else { + if (p_is_curr_task && new_window) { + flip_counters = 1; + if (full_window) + prev_sum_reset = 1; + } + } + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. 
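 *
 * Example roll-over (illustrative): if the task last ran in window N and the
 * rq is now in window N+1, the busy time collected in curr_window becomes
 * prev_window and curr_window restarts at zero; if one or more complete
 * windows elapsed since the task's stats were last updated (full_window),
 * prev_window is cleared instead, as the stale contribution belongs to an
 * even older window.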
+ */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!full_window) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (flip_counters) { + u64 curr_sum = *curr_runnable_sum; + u64 nt_curr_sum = *nt_curr_runnable_sum; + + if (prev_sum_reset) + curr_sum = nt_curr_sum = 0; + + *prev_runnable_sum = curr_sum; + *nt_prev_runnable_sum = nt_curr_sum; + + *curr_runnable_sum = 0; + *nt_curr_runnable_sum = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* + * account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO + */ + + if (!new_window) + return; + + /* + * A new window has started. The RQ demand must be rolled + * over if p is the current task. + */ + if (p_is_curr_task) { + /* p is idle task */ + BUG_ON(p != rq->idle); + } + + return; + } + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. 
+ * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. */ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static inline u32 predict_and_update_buckets(struct rq *rq, + struct task_struct *p, u32 runtime) { + + int bidx; + u32 pred_demand; + + bidx = busy_to_bucket(runtime); + pred_demand = get_pred_busy(rq, p, bidx, runtime); + bucket_increase(p->ravg.busy_buckets, bidx); + + return pred_demand; +} + +static void update_task_cpu_cycles(struct task_struct *p, int cpu) +{ + if (use_cycle_counter) + p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); +} + +static void +update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 cur_cycles; + int cpu = cpu_of(rq); + + lockdep_assert_held(&rq->lock); + + if (!use_cycle_counter) { + rq->cc.cycles = cpu_cur_freq(cpu); + rq->cc.time = 1; + return; + } + + cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + + /* + * If current task is idle task and irqtime == 0 CPU was + * indeed idle and probably its cycle counter was not + * increasing. We still need estimatied CPU frequency + * for IO wait time accounting. 
Use the previously + * calculated frequency in such a case. + */ + if (!is_idle_task(rq->curr) || irqtime) { + if (unlikely(cur_cycles < p->cpu_cycles)) + rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); + else + rq->cc.cycles = cur_cycles - p->cpu_cycles; + rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; + + if (event == IRQ_UPDATE && is_idle_task(p)) + /* + * Time between mark_start of idle task and IRQ handler + * entry time is CPU cycle counter stall period. + * Upon IRQ handler entry sched_account_irqstart() + * replenishes idle task's cpu cycle counter so + * rq->cc.cycles now represents increased cycles during + * IRQ handler rather than time between idle entry and + * IRQ exit. Thus use irqtime as time delta. + */ + rq->cc.time = irqtime; + else + rq->cc.time = wallclock - p->ravg.mark_start; + BUG_ON((s64)rq->cc.time < 0); + } + + p->cpu_cycles = cur_cycles; + + trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand, pred_demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + pred_demand = predict_and_update_buckets(rq, p, runtime); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. 
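 *
 * As an illustration of the demand value chosen above: with
 * sched_ravg_hist_size = 5, sum_history = {10, 6, 8, 7, 9} (units
 * illustrative) and a just-completed runtime of 10, WINDOW_STATS_RECENT
 * yields 10, WINDOW_STATS_MAX yields 10, WINDOW_STATS_AVG yields the
 * average 8, and the max-of-average-and-recent policy yields max(8, 10) = 10.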
+ */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, demand, + pred_demand); + + p->ravg.demand = demand; + p->ravg.pred_demand = pred_demand; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. + */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* + * Roll window_start back to current to process any remainder + * in current window. 
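 *
 * (Illustrative walk-through of case (c) above: with window_size = 20 ms,
 * mark_start 15 ms into window W and wallclock 5 ms into window W+3, the
 * first 5 ms close out W, two full 20 ms samples are recorded for W+1 and
 * W+2, and the remaining 5 ms below start the demand sum for W+3.)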
+ */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + if (!rq->window_start || sched_disable_window_stats) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) { + update_task_cpu_cycles(p, cpu_of(rq)); + goto done; + } + + update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + update_task_pred_demand(rq, p, event); +done: + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, + rq->cc.cycles, rq->cc.time, + _group_cpu_time(p->grp, cpu_of(rq))); + + p->ravg.mark_start = wallclock; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + + if (!rq->window_start || sched_disable_window_stats) + return; + + if (is_idle_task(curr)) { + /* We're here without rq->lock held, IRQ disabled */ + raw_spin_lock(&rq->lock); + update_task_cpu_cycles(curr, cpu); + raw_spin_unlock(&rq->lock); + } +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = p->last_wake_ts = wallclock; + p->last_cpu_selected_ts = wallclock; + p->last_switch_out_ts = 0; + update_task_cpu_cycles(p, cpu_of(rq)); +} + +void set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start || !sched_enable_hmp) + return; + + if (cpu == sync_cpu) { + rq->window_start = sched_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void 
migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +static void reset_all_task_stats(void) +{ + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + reset_task_stats(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +} + +static void disable_window_stats(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + sched_disable_window_stats = 1; + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_restore(flags); +} + +/* Called with all cpu's rq->lock held */ +static void enable_window_stats(void) +{ + sched_disable_window_stats = 0; + +} + +enum reset_reason_code { + WINDOW_CHANGE, + POLICY_CHANGE, + HIST_SIZE_CHANGE, + FREQ_AGGREGATE_CHANGE, +}; + +const char *sched_window_reset_reasons[] = { + "WINDOW_CHANGE", + "POLICY_CHANGE", + "HIST_SIZE_CHANGE", +}; + +/* Called with IRQs enabled */ +void reset_all_window_stats(u64 window_start, unsigned int window_size) +{ + int cpu; + unsigned long flags; + u64 start_ts = sched_ktime_clock(); + int reason = WINDOW_CHANGE; + unsigned int old = 0, new = 0; + struct related_thread_group *grp; + + disable_window_stats(); + + reset_all_task_stats(); + + local_irq_save(flags); + + read_lock(&related_thread_group_lock); + + for_each_possible_cpu(cpu) + raw_spin_lock(&cpu_rq(cpu)->lock); + + list_for_each_entry(grp, &related_thread_groups, list) { + int j; + + for_each_possible_cpu(j) { + struct group_cpu_time *cpu_time; + /* Protected by rq lock */ + cpu_time = _group_cpu_time(grp, j); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + if (window_start) + cpu_time->window_start = window_start; + } + } + + if (window_size) { + sched_ravg_window = window_size * TICK_NSEC; + set_hmp_defaults(); + } + + enable_window_stats(); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (window_start) + rq->window_start = window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + reset_cpu_hmp_stats(cpu, 1); + } + + if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { + reason = POLICY_CHANGE; + old = sched_window_stats_policy; + new = sysctl_sched_window_stats_policy; + sched_window_stats_policy = sysctl_sched_window_stats_policy; + } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { + reason = HIST_SIZE_CHANGE; + old = sched_ravg_hist_size; + new = sysctl_sched_ravg_hist_size; + sched_ravg_hist_size = sysctl_sched_ravg_hist_size; + } else if (sched_freq_aggregate != + sysctl_sched_freq_aggregate) { + reason = FREQ_AGGREGATE_CHANGE; + old = sched_freq_aggregate; + new = sysctl_sched_freq_aggregate; + sched_freq_aggregate = sysctl_sched_freq_aggregate; + } + + for_each_possible_cpu(cpu) + raw_spin_unlock(&cpu_rq(cpu)->lock); + + read_unlock(&related_thread_group_lock); + + local_irq_restore(flags); + + trace_sched_reset_all_window_stats(window_start, window_size, + sched_ktime_clock() - start_ts, reason, old, new); +} + +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); + +void sched_get_cpus_busy(struct sched_load *busy, + const struct cpumask *query_cpus) +{ + unsigned long flags; + struct rq *rq; + const int cpus = cpumask_weight(query_cpus); + u64 load[cpus], group_load[cpus]; + u64 nload[cpus], ngload[cpus]; + u64 pload[cpus]; + unsigned int cur_freq[cpus], max_freq[cpus]; + int notifier_sent = 0; + int 
early_detection[cpus]; + int cpu, i = 0; + unsigned int window_size; + u64 max_prev_sum = 0; + int max_busy_cpu = cpumask_first(query_cpus); + struct related_thread_group *grp; + + if (unlikely(cpus == 0)) + return; + + /* + * This function could be called in timer context, and the + * current task may have been executing for a long time. Ensure + * that the window stats are current by doing an update. + */ + read_lock(&related_thread_group_lock); + + local_irq_save(flags); + for_each_cpu(cpu, query_cpus) + raw_spin_lock(&cpu_rq(cpu)->lock); + + window_size = sched_ravg_window; + + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), + 0); + cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + + load[i] = rq->old_busy_time = rq->prev_runnable_sum; + nload[i] = rq->nt_prev_runnable_sum; + pload[i] = rq->hmp_stats.pred_demands_sum; + rq->old_estimated_time = pload[i]; + + if (load[i] > max_prev_sum) { + max_prev_sum = load[i]; + max_busy_cpu = cpu; + } + + /* + * sched_get_cpus_busy() is called for all CPUs in a + * frequency domain. So the notifier_sent flag per + * cluster works even when a frequency domain spans + * more than 1 cluster. + */ + if (rq->cluster->notifier_sent) { + notifier_sent = 1; + rq->cluster->notifier_sent = 0; + } + early_detection[i] = (rq->ed_task != NULL); + cur_freq[i] = cpu_cur_freq(cpu); + max_freq[i] = cpu_max_freq(cpu); + i++; + } + + for_each_related_thread_group(grp) { + for_each_cpu(cpu, query_cpus) { + /* Protected by rq_lock */ + struct group_cpu_time *cpu_time = + _group_cpu_time(grp, cpu); + sync_window_start(cpu_rq(cpu), cpu_time); + } + } + + i = 0; + for_each_cpu(cpu, query_cpus) { + group_load[i] = 0; + ngload[i] = 0; + + if (early_detection[i]) + goto skip_early; + + rq = cpu_rq(cpu); + if (!notifier_sent) { + if (cpu == max_busy_cpu) + group_load_in_freq_domain( + &rq->freq_domain_cpumask, + &group_load[i], &ngload[i]); + } else { + _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); + } + + load[i] += group_load[i]; + nload[i] += ngload[i]; + /* + * Scale load in reference to cluster max_possible_freq. + * + * Note that scale_load_to_cpu() scales load in reference to + * the cluster max_freq. 
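 *
 * Illustrative numbers for the common no-notification path: 5 ms of busy
 * time referenced to a cluster max_freq of 1500 MHz is first converted to
 * its equivalent at the current 1000 MHz (7.5 ms, clamped to one window),
 * then re-referenced to a max_possible_freq of 2000 MHz, yielding 3.75 ms
 * before the final conversion to microseconds for the governor.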
+ */ + load[i] = scale_load_to_cpu(load[i], cpu); + nload[i] = scale_load_to_cpu(nload[i], cpu); + pload[i] = scale_load_to_cpu(pload[i], cpu); +skip_early: + i++; + } + + for_each_cpu(cpu, query_cpus) + raw_spin_unlock(&(cpu_rq(cpu))->lock); + local_irq_restore(flags); + + read_unlock(&related_thread_group_lock); + + i = 0; + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + if (early_detection[i]) { + busy[i].prev_load = div64_u64(sched_ravg_window, + NSEC_PER_USEC); + busy[i].new_task_load = 0; + goto exit_early; + } + + if (!notifier_sent) { + load[i] = scale_load_to_freq(load[i], max_freq[i], + cur_freq[i]); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cur_freq[i]); + if (load[i] > window_size) + load[i] = window_size; + if (nload[i] > window_size) + nload[i] = window_size; + + load[i] = scale_load_to_freq(load[i], cur_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], cur_freq[i], + cpu_max_possible_freq(cpu)); + } else { + load[i] = scale_load_to_freq(load[i], max_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cpu_max_possible_freq(cpu)); + } + pload[i] = scale_load_to_freq(pload[i], max_freq[i], + rq->cluster->max_possible_freq); + + busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); + busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); + busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); + +exit_early: + trace_sched_get_busy(cpu, busy[i].prev_load, + busy[i].new_task_load, + busy[i].predicted_load, + early_detection[i]); + i++; + } +} + +void sched_set_io_is_busy(int val) +{ + sched_io_is_busy = val; +} + +int sched_set_window(u64 window_start, unsigned int window_size) +{ + u64 now, cur_jiffies, jiffy_ktime_ns; + s64 ws; + unsigned long flags; + + if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW) + return -EINVAL; + + mutex_lock(&policy_mutex); + + /* + * Get a consistent view of ktime, jiffies, and the time + * since the last jiffy (based on last_jiffies_update). + */ + local_irq_save(flags); + cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); + local_irq_restore(flags); + + /* translate window_start from jiffies to nanoseconds */ + ws = (window_start - cur_jiffies); /* jiffy difference */ + ws *= TICK_NSEC; + ws += jiffy_ktime_ns; + + /* + * Roll back calculated window start so that it is in + * the past (window stats must have a current window). 
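 *
 * For example (illustrative): with HZ = 100 and window_size = 2 ticks
 * (20 ms), a window_start passed in 3 jiffies ahead of the current jiffy
 * translates to roughly now + 30 ms, so the loop below subtracts 20 ms
 * twice until the start lies at or before 'now'.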
+ */ + while (ws > now) + ws -= (window_size * TICK_NSEC); + + BUG_ON(sched_ktime_clock() < ws); + + reset_all_window_stats(ws, window_size); + + sched_update_freq_max_load(cpu_possible_mask); + + mutex_unlock(&policy_mutex); + + return 0; +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + struct migration_sum_data d; + bool new_task; + struct related_thread_group *grp; + + if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) + return; + + if (exiting_task(p)) { + clear_ed_task(p, src_rq); + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + update_task_cpu_cycles(p, new_cpu); + + new_task = is_new_task(p); + /* Protected by rq_lock */ + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time; + + migrate_type = GROUP_TO_GROUP; + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + /* Protected by rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + sync_window_start(dest_rq, cpu_time); + } else { + migrate_type = RQ_TO_RQ; + d.src_rq = src_rq; + d.src_cpu_time = NULL; + d.dst_rq = dest_rq; + d.dst_cpu_time = NULL; + src_curr_runnable_sum = &src_rq->curr_runnable_sum; + src_prev_runnable_sum = &src_rq->prev_runnable_sum; + src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; + + dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; + dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; + dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; + } + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + } + + if (p == src_rq->ed_task) { + src_rq->ed_task = NULL; + if (!dest_rq->ed_task) + dest_rq->ed_task = p; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + 
BUG_ON((s64)*src_prev_runnable_sum < 0); + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_nt_prev_runnable_sum < 0); + BUG_ON((s64)*src_nt_curr_runnable_sum < 0); + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +#define sched_up_down_migrate_auto_update 1 +static void check_for_up_down_migrate_update(const struct cpumask *cpus) +{ + int i = cpumask_first(cpus); + + if (!sched_up_down_migrate_auto_update) + return; + + if (cpu_max_possible_capacity(i) == max_possible_capacity) + return; + + if (cpu_max_possible_freq(i) == cpu_max_freq(i)) + up_down_migrate_scale_factor = 1024; + else + up_down_migrate_scale_factor = (1024 * + cpu_max_possible_freq(i)) / cpu_max_freq(i); + + update_up_down_migrate(); +} + +/* Return cluster which can offer required capacity for group */ +static struct sched_cluster * +best_cluster(struct related_thread_group *grp, u64 total_demand) +{ + struct sched_cluster *cluster = NULL; + + for_each_sched_cluster(cluster) { + if (group_will_fit(cluster, grp, total_demand)) + return cluster; + } + + return NULL; +} + +static void _set_preferred_cluster(struct related_thread_group *grp) +{ + struct task_struct *p; + u64 combined_demand = 0; + + if (!sysctl_sched_enable_colocation) { + grp->last_update = sched_ktime_clock(); + grp->preferred_cluster = NULL; + return; + } + + /* + * wakeup of two or more related tasks could race with each other and + * could result in multiple calls to _set_preferred_cluster being issued + * at same time. Avoid overhead in such cases of rechecking preferred + * cluster + */ + if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) + return; + + list_for_each_entry(p, &grp->tasks, grp_list) + combined_demand += p->ravg.demand; + + grp->preferred_cluster = best_cluster(grp, combined_demand); + grp->last_update = sched_ktime_clock(); + trace_sched_set_preferred_cluster(grp, combined_demand); +} + +void set_preferred_cluster(struct related_thread_group *grp) +{ + raw_spin_lock(&grp->lock); + _set_preferred_cluster(grp); + raw_spin_unlock(&grp->lock); +} + +#define ADD_TASK 0 +#define REM_TASK 1 + +static inline void free_group_cputime(struct related_thread_group *grp) +{ + free_percpu(grp->cpu_time); +} + +static int alloc_group_cputime(struct related_thread_group *grp) +{ + int i; + struct group_cpu_time *cpu_time; + int cpu = raw_smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + u64 window_start = rq->window_start; + + grp->cpu_time = alloc_percpu(struct group_cpu_time); + if (!grp->cpu_time) + return -ENOMEM; + + for_each_possible_cpu(i) { + cpu_time = per_cpu_ptr(grp->cpu_time, i); + memset(cpu_time, 0, sizeof(struct group_cpu_time)); + cpu_time->window_start = window_start; + } + + return 0; +} + +/* + * A group's window_start may be behind. When moving it forward, flip prev/curr + * counters. 
When moving forward > 1 window, prev counter is set to 0 + */ +static inline void +sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) +{ + u64 delta; + int nr_windows; + u64 curr_sum = cpu_time->curr_runnable_sum; + u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; + + delta = rq->window_start - cpu_time->window_start; + if (!delta) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + if (nr_windows > 1) + curr_sum = nt_curr_sum = 0; + + cpu_time->prev_runnable_sum = curr_sum; + cpu_time->curr_runnable_sum = 0; + + cpu_time->nt_prev_runnable_sum = nt_curr_sum; + cpu_time->nt_curr_runnable_sum = 0; + + cpu_time->window_start = rq->window_start; +} + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + struct migration_sum_data d; + int migrate_type; + + if (!sched_freq_aggregate) + return; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + + /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ + cpu_time = _group_cpu_time(grp, cpu_of(rq)); + if (event == ADD_TASK) { + sync_window_start(rq, cpu_time); + migrate_type = RQ_TO_GROUP; + d.src_rq = rq; + d.src_cpu_time = NULL; + d.dst_rq = NULL; + d.dst_cpu_time = cpu_time; + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } else { + migrate_type = GROUP_TO_RQ; + d.src_rq = NULL; + d.src_cpu_time = cpu_time; + d.dst_rq = rq; + d.dst_cpu_time = NULL; + + /* + * In case of REM_TASK, cpu_time->window_start would be + * uptodate, because of the update_task_ravg() we called + * above on the moving task. 
Hence no need for + * sync_window_start() + */ + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + } + + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + + if (is_new_task(p)) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + trace_sched_migration_update_sum(p, migrate_type, &d); + + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_prev_runnable_sum < 0); +} + +static inline struct group_cpu_time * +task_group_cpu_time(struct task_struct *p, int cpu) +{ + return _group_cpu_time(rcu_dereference(p->grp), cpu); +} + +static inline struct group_cpu_time * +_group_cpu_time(struct related_thread_group *grp, int cpu) +{ + return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; +} + +struct related_thread_group *alloc_related_thread_group(int group_id) +{ + struct related_thread_group *grp; + + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + return ERR_PTR(-ENOMEM); + + if (alloc_group_cputime(grp)) { + kfree(grp); + return ERR_PTR(-ENOMEM); + } + + grp->id = group_id; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + return grp; +} + +struct related_thread_group *lookup_related_thread_group(unsigned int group_id) +{ + struct related_thread_group *grp; + + list_for_each_entry(grp, &related_thread_groups, list) { + if (grp->id == group_id) + return grp; + } + + return NULL; +} + +/* See comments before preferred_cluster() */ +static void free_related_thread_group(struct rcu_head *rcu) +{ + struct related_thread_group *grp = container_of(rcu, struct + related_thread_group, rcu); + + free_group_cputime(grp); + kfree(grp); +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq; + int empty_group = 1; + + raw_spin_lock(&grp->lock); + + rq = __task_rq_lock(p); + transfer_busy_time(rq, p->grp, p, REM_TASK); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + __task_rq_unlock(rq); + + if (!list_empty(&grp->tasks)) { + empty_group = 0; + _set_preferred_cluster(grp); + } + + raw_spin_unlock(&grp->lock); + + if (empty_group) { + list_del(&grp->list); + call_rcu(&grp->rcu, free_related_thread_group); + } +} + +static int +add_task_to_group(struct task_struct *p, struct related_thread_group *grp) +{ + struct rq *rq; + + raw_spin_lock(&grp->lock); + + /* + * Change p->grp under rq->lock. 
Will prevent races with read-side + * reference of p->grp in various hot-paths + */ + rq = __task_rq_lock(p); + transfer_busy_time(rq, grp, p, ADD_TASK); + list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); + __task_rq_unlock(rq); + + _set_preferred_cluster(grp); + + raw_spin_unlock(&grp->lock); + + return 0; +} + +void add_new_task_to_grp(struct task_struct *new) +{ + unsigned long flags; + struct related_thread_group *grp; + struct task_struct *parent; + + if (!sysctl_sched_enable_thread_grouping) + return; + + if (thread_group_leader(new)) + return; + + parent = new->group_leader; + + /* + * The parent's pi_lock is required here to protect race + * against the parent task being removed from the + * group. + */ + raw_spin_lock_irqsave(&parent->pi_lock, flags); + + /* protected by pi_lock. */ + grp = task_related_thread_group(parent); + if (!grp) { + raw_spin_unlock_irqrestore(&parent->pi_lock, flags); + return; + } + raw_spin_lock(&grp->lock); + + rcu_assign_pointer(new->grp, grp); + list_add(&new->grp_list, &grp->tasks); + + raw_spin_unlock(&grp->lock); + raw_spin_unlock_irqrestore(&parent->pi_lock, flags); +} + +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0, destroy = 0; + unsigned long flags; + struct related_thread_group *grp = NULL, *new = NULL; + +redo: + raw_spin_lock_irqsave(&p->pi_lock, flags); + + if ((current != p && p->flags & PF_EXITING) || + (!p->grp && !group_id) || + (p->grp && p->grp->id == group_id)) + goto done; + + write_lock(&related_thread_group_lock); + + if (!group_id) { + remove_task_from_group(p); + write_unlock(&related_thread_group_lock); + goto done; + } + + if (p->grp && p->grp->id != group_id) + remove_task_from_group(p); + + grp = lookup_related_thread_group(group_id); + if (!grp && !new) { + /* New group */ + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + new = alloc_related_thread_group(group_id); + if (IS_ERR(new)) + return -ENOMEM; + destroy = 1; + /* Rerun checks (like task exiting), since we dropped pi_lock */ + goto redo; + } else if (!grp && new) { + /* New group - use object allocated before */ + destroy = 0; + list_add(&new->list, &related_thread_groups); + grp = new; + } + + BUG_ON(!grp); + rc = add_task_to_group(p, grp); + write_unlock(&related_thread_group_lock); +done: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + if (new && destroy) { + free_group_cputime(new); + kfree(new); + } + + return rc; +} + +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct related_thread_group *grp; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? 
grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + +static void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + + cpumask_copy(&cpumask, cpus); + pre_big_task_count_change(cpu_possible_mask); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + /* 'cpus' can contain cpumask more than one cluster */ + check_for_up_down_migrate_update(&cluster->cpus); + } + + __update_min_max_capacity(); + + post_big_task_count_change(cpu_possible_mask); +} + +static DEFINE_SPINLOCK(cpu_freq_min_max_lock); +void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) +{ + struct cpumask cpumask; + struct sched_cluster *cluster; + int i, update_capacity = 0; + unsigned long flags; + + spin_lock_irqsave(&cpu_freq_min_max_lock, flags); + cpumask_copy(&cpumask, cpus); + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + update_capacity += (cluster->max_mitigated_freq != fmax); + cluster->max_mitigated_freq = fmax; + } + spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); + + if (update_capacity) + update_cpu_cluster_capacity(cpus); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, &policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_cur_freq(cpu) == new_freq) + return 0; + + for_each_cpu(i, 
&policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return 0; +} + +static int pwr_stats_ready_notifier(struct notifier_block *nb, + unsigned long cpu, void *data) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + sched_update_freq_max_load(&mask); + + mutex_lock(&cluster_lock); + sort_clusters(); + mutex_unlock(&cluster_lock); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static struct notifier_block notifier_pwr_stats_ready = { + .notifier_call = pwr_stats_ready_notifier +}; + +int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) +{ + return -EINVAL; +} + +static int register_sched_callback(void) +{ + int ret; + + if (!sched_enable_hmp) + return 0; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + u32 new_load = task_load(p); + + if (!grp) + return 0; + + /* + * Update if task's load has changed significantly or a complete window + * has passed since we last updated preference + */ + if (abs(new_load - old_load) > sched_ravg_window / 4 || + sched_ktime_clock() - grp->last_update > sched_ravg_window) + return 1; + + return 0; +} + +bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + struct task_struct *p; + int loop_max = 10; + + if (!sched_boost() || !rq->cfs.h_nr_running) + return 0; + + rq->ed_task = NULL; + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { + if (!loop_max) + break; + + if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { + rq->ed_task = p; + return 1; + } + + loop_max--; + } + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED +u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->upmigrate_discouraged; +} + +int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage) +{ + struct task_group *tg = css_tg(css); + int discourage = upmigrate_discourage > 0; + + if (tg->upmigrate_discouraged == discourage) + return 0; + + /* + * Revisit big-task classification for tasks of this cgroup. It would + * have been efficient to walk tasks of just this cgroup in running + * state, but we don't have easy means to do that. Walk all tasks in + * running state on all cpus instead and re-visit their big task + * classification. 
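
Editor's note: update_preferred_cluster() above asks for a re-evaluation only when the task's demand has moved by more than a quarter of the averaging window, or when a full window has elapsed since the group's last update. A small userspace model of that predicate follows; should_reevaluate and WINDOW_NS are made-up names standing in for the kernel symbols, and the 20 ms window length is an assumption for illustration only.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for sched_ravg_window; assumed 20 ms (in ns) for this demo. */
	#define WINDOW_NS 20000000ULL

	/*
	 * Re-evaluate the preferred cluster only when the load changed by more
	 * than a quarter window, or a whole window passed since the last update.
	 */
	static bool should_reevaluate(uint32_t old_load, uint32_t new_load,
				      uint64_t now_ns, uint64_t last_update_ns)
	{
		uint32_t delta = new_load > old_load ? new_load - old_load
						     : old_load - new_load;

		return delta > WINDOW_NS / 4 ||
		       now_ns - last_update_ns > WINDOW_NS;
	}

	int main(void)
	{
		/* Small load change inside the same window: no re-evaluation. */
		printf("%d\n", should_reevaluate(1000000, 1200000, 5000000, 0));
		/* A full window has elapsed since the last update: re-evaluate. */
		printf("%d\n", should_reevaluate(1000000, 1200000, 25000000, 0));
		return 0;
	}
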
+ */ + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + + tg->upmigrate_discouraged = discourage; + + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + + return 0; +} +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b9566cf3ad37..e31334d5f581 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -254,7 +254,6 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; - bool notify_on_migrate; #ifdef CONFIG_SCHED_HMP bool upmigrate_discouraged; #endif @@ -356,6 +355,7 @@ extern void sched_move_task(struct task_struct *tsk); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); #endif +extern struct task_group *css_tg(struct cgroup_subsys_state *css); #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; @@ -367,9 +367,7 @@ struct cfs_bandwidth { }; struct hmp_sched_stats { int nr_big_tasks; u64 cumulative_runnable_avg; -#ifdef CONFIG_SCHED_FREQ_INPUT u64 pred_demands_sum; -#endif }; struct sched_cluster { @@ -393,6 +391,7 @@ struct sched_cluster { bool freq_init_done; int dstate, dstate_wakeup_latency, dstate_wakeup_energy; unsigned int static_cluster_pwr_cost; + int notifier_sent; }; extern unsigned long all_cluster_ids[]; @@ -410,23 +409,17 @@ struct related_thread_group { struct sched_cluster *preferred_cluster; struct rcu_head rcu; u64 last_update; -#ifdef CONFIG_SCHED_FREQ_INPUT struct group_cpu_time __percpu *cpu_time; /* one per cluster */ -#endif }; struct migration_sum_data { struct rq *src_rq, *dst_rq; -#ifdef CONFIG_SCHED_FREQ_INPUT struct group_cpu_time *src_cpu_time, *dst_cpu_time; -#endif }; extern struct list_head cluster_head; extern int num_clusters; extern struct sched_cluster *sched_cluster[NR_CPUS]; -extern int group_will_fit(struct sched_cluster *cluster, - struct related_thread_group *grp, u64 demand); struct cpu_cycle { u64 cycles; @@ -436,7 +429,7 @@ struct cpu_cycle { #define for_each_sched_cluster(cluster) \ list_for_each_entry_rcu(cluster, &cluster_head, list) -#endif +#endif /* CONFIG_SCHED_HMP */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -736,7 +729,6 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; - int cstate, wakeup_latency, wakeup_energy; /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; @@ -747,6 +739,7 @@ struct rq { struct cpumask freq_domain_cpumask; struct hmp_sched_stats hmp_stats; + int cstate, wakeup_latency, wakeup_energy; u64 window_start; unsigned long hmp_flags; @@ -756,15 +749,8 @@ struct rq { unsigned int static_cpu_pwr_cost; struct task_struct *ed_task; struct cpu_cycle cc; - -#ifdef CONFIG_SCHED_FREQ_INPUT u64 old_busy_time, old_busy_time_group; - int notifier_sent; u64 old_estimated_time; -#endif -#endif - -#ifdef CONFIG_SCHED_FREQ_INPUT u64 curr_runnable_sum; u64 prev_runnable_sum; u64 nt_curr_runnable_sum; @@ -1036,8 +1022,6 @@ static inline void sched_ttwu_pending(void) { } #include "stats.h" #include "auto_group.h" -extern void init_new_task_load(struct task_struct *p); - #ifdef CONFIG_SCHED_HMP #define WINDOW_STATS_RECENT 0 @@ -1046,9 +1030,16 @@ extern void init_new_task_load(struct task_struct *p); #define WINDOW_STATS_AVG 3 #define WINDOW_STATS_INVALID_POLICY 4 +#define MAJOR_TASK_PCT 85 +#define SCHED_UPMIGRATE_MIN_NICE 15 +#define EXITING_TASK_MARKER 0xdeaddead + +#define UP_MIGRATION 1 +#define DOWN_MIGRATION 2 +#define IRQLOAD_MIGRATION 3 + extern struct mutex policy_mutex; extern unsigned int sched_ravg_window; -extern unsigned int 
sched_use_pelt; extern unsigned int sched_disable_window_stats; extern unsigned int sched_enable_hmp; extern unsigned int max_possible_freq; @@ -1063,28 +1054,59 @@ extern unsigned int max_possible_capacity; extern unsigned int min_max_possible_capacity; extern unsigned int sched_upmigrate; extern unsigned int sched_downmigrate; -extern unsigned int sched_init_task_load_pelt; extern unsigned int sched_init_task_load_windows; extern unsigned int up_down_migrate_scale_factor; extern unsigned int sysctl_sched_restrict_cluster_spill; extern unsigned int sched_pred_alert_load; - -#ifdef CONFIG_SCHED_FREQ_INPUT -#define MAJOR_TASK_PCT 85 extern unsigned int sched_major_task_runtime; -#endif +extern struct sched_cluster init_cluster; +extern unsigned int __read_mostly sched_short_sleep_task_threshold; +extern unsigned int __read_mostly sched_long_cpu_selection_threshold; +extern unsigned int __read_mostly sched_big_waker_task_load; +extern unsigned int __read_mostly sched_small_wakee_task_load; +extern unsigned int __read_mostly sched_spill_load; +extern unsigned int __read_mostly sched_upmigrate; +extern unsigned int __read_mostly sched_downmigrate; +extern unsigned int __read_mostly sysctl_sched_spill_nr_run; +extern void init_new_task_load(struct task_struct *p); +extern u64 sched_ktime_clock(void); +extern int got_boost_kick(void); +extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +extern bool early_detection_notify(struct rq *rq, u64 wallclock); +extern void clear_ed_task(struct task_struct *p, struct rq *rq); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void clear_boost_kick(int cpu); +extern void clear_hmp_request(int cpu); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +extern void migrate_sync_cpu(int cpu); +extern void update_cluster_topology(void); +extern void set_task_last_wake(struct task_struct *p, u64 wallclock); +extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); +extern void init_clusters(void); +extern int __init set_sched_enable_hmp(char *str); extern void reset_cpu_hmp_stats(int cpu, int reset_cra); extern unsigned int max_task_load(void); extern void sched_account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock); extern void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock); - -unsigned int cpu_temp(int cpu); -int sched_set_group_id(struct task_struct *p, unsigned int group_id); +extern unsigned int cpu_temp(int cpu); extern unsigned int nr_eligible_big_tasks(int cpu); extern void update_up_down_migrate(void); +extern int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load); +extern void set_preferred_cluster(struct related_thread_group *grp); +extern void add_new_task_to_grp(struct task_struct *new); + +enum sched_boost_type { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; static inline struct sched_cluster *cpu_cluster(int cpu) { @@ -1180,20 +1202,9 @@ static inline u64 scale_load_to_cpu(u64 task_load, int cpu) static inline unsigned int task_load(struct task_struct *p) { - if (sched_use_pelt) - return p->se.avg.runnable_avg_sum_scaled; - return p->ravg.demand; } -#ifdef CONFIG_SCHED_FREQ_INPUT -#define set_pred_demands_sum(stats, x) ((stats)->pred_demands_sum = (x)) -#define verify_pred_demands_sum(stat) 
BUG_ON((s64)(stat)->pred_demands_sum < 0) -#else -#define set_pred_demands_sum(stats, x) -#define verify_pred_demands_sum(stat) -#endif - static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, struct task_struct *p) @@ -1203,33 +1214,29 @@ inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, if (!sched_enable_hmp || sched_disable_window_stats) return; - task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled : - (sched_disable_window_stats ? 0 : p->ravg.demand); + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; stats->cumulative_runnable_avg += task_load; - set_pred_demands_sum(stats, stats->pred_demands_sum + - p->ravg.pred_demand); + stats->pred_demands_sum += p->ravg.pred_demand; } static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, - struct task_struct *p) + struct task_struct *p) { u32 task_load; if (!sched_enable_hmp || sched_disable_window_stats) return; - task_load = sched_use_pelt ? p->se.avg.runnable_avg_sum_scaled : - (sched_disable_window_stats ? 0 : p->ravg.demand); + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; stats->cumulative_runnable_avg -= task_load; BUG_ON((s64)stats->cumulative_runnable_avg < 0); - set_pred_demands_sum(stats, stats->pred_demands_sum - - p->ravg.pred_demand); - verify_pred_demands_sum(stats); + stats->pred_demands_sum -= p->ravg.pred_demand; + BUG_ON((s64)stats->pred_demands_sum < 0); } static inline void @@ -1243,12 +1250,10 @@ fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, stats->cumulative_runnable_avg += task_load_delta; BUG_ON((s64)stats->cumulative_runnable_avg < 0); - set_pred_demands_sum(stats, stats->pred_demands_sum + - pred_demand_delta); - verify_pred_demands_sum(stats); + stats->pred_demands_sum += pred_demand_delta; + BUG_ON((s64)stats->pred_demands_sum < 0); } - #define pct_to_real(tunable) \ (div64_u64((u64)tunable * (u64)max_task_load(), 100)) @@ -1280,90 +1285,25 @@ static inline int sched_cpu_high_irqload(int cpu) return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; } -static inline -struct related_thread_group *task_related_thread_group(struct task_struct *p) +static inline bool task_in_related_thread_group(struct task_struct *p) { - return rcu_dereference(p->grp); + return !!(rcu_access_pointer(p->grp) != NULL); } -#else /* CONFIG_SCHED_HMP */ - -#define sched_use_pelt 0 - -struct hmp_sched_stats; -struct related_thread_group; - -static inline u64 scale_load_to_cpu(u64 load, int cpu) -{ - return load; -} - -static inline unsigned int nr_eligible_big_tasks(int cpu) -{ - return 0; -} - -static inline int pct_task_load(struct task_struct *p) { return 0; } - -static inline int cpu_capacity(int cpu) -{ - return SCHED_LOAD_SCALE; -} - -static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } - -static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, - struct task_struct *p) -{ -} - -static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, - struct task_struct *p) -{ -} - -static inline void sched_account_irqtime(int cpu, struct task_struct *curr, - u64 delta, u64 wallclock) -{ -} - -static inline void sched_account_irqstart(int cpu, struct task_struct *curr, - u64 wallclock) -{ -} - -static inline int sched_cpu_high_irqload(int cpu) { return 0; } - -static inline void set_preferred_cluster(struct related_thread_group *grp) { } - static inline struct related_thread_group *task_related_thread_group(struct task_struct *p) { - return NULL; -} - -static inline u32 
task_load(struct task_struct *p) { return 0; } - -static inline int update_preferred_cluster(struct related_thread_group *grp, - struct task_struct *p, u32 old_load) -{ - return 0; + return rcu_dereference(p->grp); } -#endif /* CONFIG_SCHED_HMP */ - -/* - * Returns the rq capacity of any rq in a group. This does not play - * well with groups where rq capacity can change independently. - */ -#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) - -#ifdef CONFIG_SCHED_FREQ_INPUT #define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand) extern void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); +extern void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p); + struct group_cpu_time { u64 curr_runnable_sum; u64 prev_runnable_sum; @@ -1383,23 +1323,6 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu) return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); } -#else /* CONFIG_SCHED_FREQ_INPUT */ - -#define sched_migration_fixup 0 -#define PRED_DEMAND_DELTA (0) - -static inline void -check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } - -static inline int same_freq_domain(int src_cpu, int dst_cpu) -{ - return 1; -} - -#endif /* CONFIG_SCHED_FREQ_INPUT */ - -#ifdef CONFIG_SCHED_HMP - #define BOOST_KICK 0 #define CPU_RESERVED 1 @@ -1456,11 +1379,220 @@ extern unsigned int power_cost(int cpu, u64 demand); extern void reset_all_window_stats(u64 window_start, unsigned int window_size); extern void boost_kick(int cpu); extern int sched_boost(void); +extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_type boost_type); +extern enum sched_boost_type sched_boost_type(void); +extern int task_will_fit(struct task_struct *p, int cpu); +extern int group_will_fit(struct sched_cluster *cluster, + struct related_thread_group *grp, u64 demand); +extern u64 cpu_load(int cpu); +extern u64 cpu_load_sync(int cpu, int sync); +extern int preferred_cluster(struct sched_cluster *cluster, + struct task_struct *p); +extern void inc_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void dec_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void inc_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern void dec_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern int is_big_task(struct task_struct *p); +extern int upmigrate_discouraged(struct task_struct *p); +extern struct sched_cluster *rq_cluster(struct rq *rq); +extern int nr_big_tasks(struct rq *rq); +extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta); +extern void reset_task_stats(struct task_struct *p); +extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra); +extern void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra); +extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage); + +#else /* CONFIG_SCHED_HMP */ + +struct hmp_sched_stats; +struct related_thread_group; +struct sched_cluster; + +static inline int got_boost_kick(void) +{ + return 0; +} + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline bool early_detection_notify(struct rq *rq, u64 wallclock) 
+{ + return 0; +} + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { } +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void clear_boost_kick(int cpu) { } +static inline void clear_hmp_request(int cpu) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void migrate_sync_cpu(int cpu) { } +static inline void update_cluster_topology(void) { } +static inline void set_task_last_wake(struct task_struct *p, u64 wallclock) { } +static inline void set_task_last_switch_out(struct task_struct *p, + u64 wallclock) { } + +static inline int task_will_fit(struct task_struct *p, int cpu) +{ + return 1; +} + +static inline int select_best_cpu(struct task_struct *p, int target, + int reason, int sync) +{ + return 0; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline int sched_boost(void) +{ + return 0; +} + +static inline int is_big_task(struct task_struct *p) +{ + return 0; +} -#else /* CONFIG_SCHED_HMP */ +static inline int nr_big_tasks(struct rq *rq) +{ + return 0; +} + +static inline int is_cpu_throttling_imminent(int cpu) +{ + return 0; +} + +static inline int is_task_migration_throttled(struct task_struct *p) +{ + return 0; +} + +static inline unsigned int cpu_temp(int cpu) +{ + return 0; +} + +static inline void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + return 1; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return NULL; +} + +static inline void init_new_task_load(struct task_struct *p) { } + +static inline u64 scale_load_to_cpu(u64 load, int cpu) +{ + return load; +} + +static inline unsigned int nr_eligible_big_tasks(int cpu) +{ + return 0; +} + +static inline int pct_task_load(struct task_struct *p) { return 0; } + +static inline int cpu_capacity(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } + +static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ +} + +static inline void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock) +{ +} + +static inline int sched_cpu_high_irqload(int cpu) { return 0; } + +static inline void set_preferred_cluster(struct related_thread_group *grp) { } + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return false; +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return NULL; +} + +static inline u32 task_load(struct task_struct *p) { return 0; } + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + return 0; +} + +static inline void add_new_task_to_grp(struct task_struct *new) {} #define sched_enable_hmp 0 
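
Editor's note: the stub block above and below follows the usual kernel pattern for the !CONFIG_SCHED_HMP build: every HMP entry point becomes an empty static inline and each tunable collapses to a constant macro, so common scheduler code can call them unconditionally and the compiler folds the dead paths away. A self-contained illustration of the pattern is shown below; FEATURE_X, feature_enabled and do_feature_work() are invented names, not kernel symbols.

	#include <stdio.h>

	/* Build with -DFEATURE_X for the real implementation, without it for stubs. */
	#ifdef FEATURE_X

	static int feature_enabled = 1;

	static void do_feature_work(int cpu)
	{
		printf("doing feature work on cpu %d\n", cpu);
	}

	#else  /* !FEATURE_X */

	/* The tunable collapses to a constant, so "if (feature_enabled)" folds away... */
	#define feature_enabled 0

	/*
	 * ...and the entry point becomes an empty static inline, so callers
	 * compile unchanged without any #ifdefs of their own.
	 */
	static inline void do_feature_work(int cpu) { }

	#endif /* FEATURE_X */

	int main(void)
	{
		/* Common code path: identical whether or not FEATURE_X is built in. */
		if (feature_enabled)
			do_feature_work(0);
		return 0;
	}
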
#define sched_freq_legacy_mode 1 +#define sched_migration_fixup 0 +#define PRED_DEMAND_DELTA (0) + +static inline void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } + +static inline void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } static inline void pre_big_task_count_change(void) { } @@ -1474,7 +1606,13 @@ static inline void clear_reserved(int cpu) { } #define trace_sched_cpu_load_cgroup(...) #define trace_sched_cpu_load_wakeup(...) -#endif /* CONFIG_SCHED_HMP */ +#endif /* CONFIG_SCHED_HMP */ + +/* + * Returns the rq capacity of any rq in a group. This does not play + * well with groups where rq capacity can change independently. + */ +#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) #ifdef CONFIG_CGROUP_SCHED @@ -1496,11 +1634,6 @@ static inline struct task_group *task_group(struct task_struct *p) return p->sched_task_group; } -static inline bool task_notify_on_migrate(struct task_struct *p) -{ - return task_group(p)->notify_on_migrate; -} - /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { @@ -1526,10 +1659,6 @@ static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -static inline bool task_notify_on_migrate(struct task_struct *p) -{ - return false; -} #endif /* CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81fbed978da3..ac34212f6881 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -292,14 +292,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_wakeup_load_threshold", - .data = &sysctl_sched_wakeup_load_threshold, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHED_FREQ_INPUT +#ifdef CONFIG_SCHED_HMP { .procname = "sched_freq_inc_notify", .data = &sysctl_sched_freq_inc_notify, @@ -316,8 +309,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, -#endif -#ifdef CONFIG_SCHED_HMP { .procname = "sched_cpu_high_irqload", .data = &sysctl_sched_cpu_high_irqload, @@ -414,7 +405,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_hmp_proc_update_handler, }, -#ifdef CONFIG_SCHED_FREQ_INPUT + { + .procname = "sched_enable_thread_grouping", + .data = &sysctl_sched_enable_thread_grouping, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_new_task_windows", .data = &sysctl_sched_new_task_windows, @@ -437,7 +434,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_window_update_handler, }, -#endif { .procname = "sched_boost", .data = &sysctl_sched_boost, |
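
Editor's note: the sysctl.c hunk above registers a new "sched_enable_thread_grouping" entry in kern_table, which gates the automatic parent/child thread grouping added in add_new_task_to_grp(). Entries in kern_table normally surface under /proc/sys/kernel/, so the knob should be reachable at /proc/sys/kernel/sched_enable_thread_grouping on HMP-enabled kernels; the path below is inferred from that convention rather than taken from this diff, and writing it requires root.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Toggle automatic thread grouping via the sysctl added by this patch. */
	int main(void)
	{
		const char *path = "/proc/sys/kernel/sched_enable_thread_grouping";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror(path);
			return 1;
		}
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}
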