/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Implementation credits: Srivatsa Vaddagiri, Steve Muckle,
 * Syed Rameez Mustafa, Olav Haugan, Joonwoo Park, Pavan Kumar Kondeti
 * and Vikram Mulukutla
 */

#include <linux/cpufreq.h>
#include <linux/list_sort.h>
#include <linux/syscore_ops.h>

#include "sched.h"

#include <trace/events/sched.h>

#define CSTATE_LATENCY_GRANULARITY_SHIFT (6)

const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
				  "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
				  "IRQ_UPDATE"};

const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
				    "RQ_TO_RQ", "GROUP_TO_GROUP"};

static ktime_t ktime_last;
static bool sched_ktime_suspended;

static bool use_cycle_counter;
static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;

u64 sched_ktime_clock(void)
{
	if (unlikely(sched_ktime_suspended))
		return ktime_to_ns(ktime_last);
	return ktime_get_ns();
}

static void sched_resume(void)
{
	sched_ktime_suspended = false;
}

static int sched_suspend(void)
{
	ktime_last = ktime_get();
	sched_ktime_suspended = true;

	return 0;
}

static struct syscore_ops sched_syscore_ops = {
	.resume = sched_resume,
	.suspend = sched_suspend
};

static int __init sched_init_ops(void)
{
	register_syscore_ops(&sched_syscore_ops);
	return 0;
}
late_initcall(sched_init_ops);

inline void clear_ed_task(struct task_struct *p, struct rq *rq)
{
	if (p == rq->ed_task)
		rq->ed_task = NULL;
}

inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
{
	p->last_wake_ts = wallclock;
}

inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
{
	p->last_switch_out_ts = wallclock;
}

/*
 * Note C-state for (idle) cpus.
 *
 * @cstate = cstate index, 0 -> active state
 * @wakeup_energy = energy spent in waking up cpu
 * @wakeup_latency = latency to wake up from the cstate
 *
 */
void
sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
{
	struct rq *rq = cpu_rq(cpu);

	rq->cstate = cstate; /* C1, C2 etc */
	rq->wakeup_energy = wakeup_energy;
	/* disregard small latency delta (64 us). */
	rq->wakeup_latency = ((wakeup_latency >>
			       CSTATE_LATENCY_GRANULARITY_SHIFT) <<
			      CSTATE_LATENCY_GRANULARITY_SHIFT);
}

/*
 * Note D-state for (idle) cluster.
 *
 * @dstate = dstate index, 0 -> active state
 * @wakeup_energy = energy spent in waking up the cluster
 * @wakeup_latency = latency to wake up from the cluster d-state
 *
 */
void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
			      int wakeup_energy, int wakeup_latency)
{
	struct sched_cluster *cluster =
		cpu_rq(cpumask_first(cluster_cpus))->cluster;

	cluster->dstate = dstate;
	cluster->dstate_wakeup_energy = wakeup_energy;
	cluster->dstate_wakeup_latency = wakeup_latency;
}

u32 __weak get_freq_max_load(int cpu, u32 freq)
{
	/* 100% by default */
	return 100;
}

struct freq_max_load_entry {
	/* The maximum load which accounts for the governor's headroom.
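	 * For example (illustrative numbers, not taken from this code): if
	 * get_freq_max_load() reports hpct = 90 for a frequency that is half
	 * of cpu_max_possible_freq(), sched_update_freq_max_load() below
	 * stores
	 *   hdemand = max_demand * (freq * 90 / 100) / cpu_max_possible_freq()
	 *           = 0.45 * max_demand
	 * for that entry.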
*/ u64 hdemand; }; struct freq_max_load { struct rcu_head rcu; int length; struct freq_max_load_entry freqs[0]; }; static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); static DEFINE_SPINLOCK(freq_max_load_lock); struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) { return NULL; } int sched_update_freq_max_load(const cpumask_t *cpumask) { int i, cpu, ret; unsigned int freq; struct cpu_pstate_pwr *costs; struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); struct freq_max_load *max_load, *old_max_load; struct freq_max_load_entry *entry; u64 max_demand_capacity, max_demand; unsigned long flags; u32 hfreq; int hpct; if (!per_cpu_info) return 0; spin_lock_irqsave(&freq_max_load_lock, flags); max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); for_each_cpu(cpu, cpumask) { if (!per_cpu_info[cpu].ptable) { ret = -EINVAL; goto fail; } old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); /* * allocate len + 1 and leave the last power cost as 0 for * power_cost() can stop iterating index when * per_cpu_info[cpu].len > len of max_load due to race between * cpu power stats update and get_cpu_pwr_stats(). */ max_load = kzalloc(sizeof(struct freq_max_load) + sizeof(struct freq_max_load_entry) * (per_cpu_info[cpu].len + 1), GFP_ATOMIC); if (unlikely(!max_load)) { ret = -ENOMEM; goto fail; } max_load->length = per_cpu_info[cpu].len; max_demand = max_demand_capacity * cpu_max_possible_capacity(cpu); i = 0; costs = per_cpu_info[cpu].ptable; while (costs[i].freq) { entry = &max_load->freqs[i]; freq = costs[i].freq; hpct = get_freq_max_load(cpu, freq); if (hpct <= 0 || hpct > 100) hpct = 100; hfreq = div64_u64((u64)freq * hpct, 100); entry->hdemand = div64_u64(max_demand * hfreq, cpu_max_possible_freq(cpu)); i++; } rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); if (old_max_load) kfree_rcu(old_max_load, rcu); } spin_unlock_irqrestore(&freq_max_load_lock, flags); return 0; fail: for_each_cpu(cpu, cpumask) { max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); if (max_load) { rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); kfree_rcu(max_load, rcu); } } spin_unlock_irqrestore(&freq_max_load_lock, flags); return ret; } unsigned int max_possible_efficiency = 1; unsigned int min_possible_efficiency = UINT_MAX; unsigned long __weak arch_get_cpu_efficiency(int cpu) { return SCHED_LOAD_SCALE; } /* Keep track of max/min capacity possible across CPUs "currently" */ static void __update_min_max_capacity(void) { int i; int max_cap = 0, min_cap = INT_MAX; for_each_online_cpu(i) { max_cap = max(max_cap, cpu_capacity(i)); min_cap = min(min_cap, cpu_capacity(i)); } max_capacity = max_cap; min_capacity = min_cap; } static void update_min_max_capacity(void) { unsigned long flags; int i; local_irq_save(flags); for_each_possible_cpu(i) raw_spin_lock(&cpu_rq(i)->lock); __update_min_max_capacity(); for_each_possible_cpu(i) raw_spin_unlock(&cpu_rq(i)->lock); local_irq_restore(flags); } /* * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that * least efficient cpu gets capacity of 1024 */ static unsigned long capacity_scale_cpu_efficiency(struct sched_cluster *cluster) { return (1024 * cluster->efficiency) / min_possible_efficiency; } /* * Return 'capacity' of a cpu in reference to cpu with lowest max_freq * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. 
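 * For instance (frequencies are illustrative): with min_max_freq at
 * 1500 MHz, a cluster whose max frequency is 2000 MHz scales to
 * 1024 * 2000 / 1500 ~= 1365, while the slowest cluster stays at 1024.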
*/ static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) { return (1024 * cluster_max_freq(cluster)) / min_max_freq; } /* * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so * that "most" efficient cpu gets a load_scale_factor of 1 */ static inline unsigned long load_scale_cpu_efficiency(struct sched_cluster *cluster) { return DIV_ROUND_UP(1024 * max_possible_efficiency, cluster->efficiency); } /* * Return load_scale_factor of a cpu in reference to cpu with best max_freq * (max_possible_freq), so that one with best max_freq gets a load_scale_factor * of 1. */ static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) { return DIV_ROUND_UP(1024 * max_possible_freq, cluster_max_freq(cluster)); } static int compute_capacity(struct sched_cluster *cluster) { int capacity = 1024; capacity *= capacity_scale_cpu_efficiency(cluster); capacity >>= 10; capacity *= capacity_scale_cpu_freq(cluster); capacity >>= 10; return capacity; } static int compute_max_possible_capacity(struct sched_cluster *cluster) { int capacity = 1024; capacity *= capacity_scale_cpu_efficiency(cluster); capacity >>= 10; capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; capacity >>= 10; return capacity; } static int compute_load_scale_factor(struct sched_cluster *cluster) { int load_scale = 1024; /* * load_scale_factor accounts for the fact that task load * is in reference to "best" performing cpu. Task's load will need to be * scaled (up) by a factor to determine suitability to be placed on a * (little) cpu. */ load_scale *= load_scale_cpu_efficiency(cluster); load_scale >>= 10; load_scale *= load_scale_cpu_freq(cluster); load_scale >>= 10; return load_scale; } struct list_head cluster_head; static DEFINE_MUTEX(cluster_lock); static cpumask_t all_cluster_cpus = CPU_MASK_NONE; DECLARE_BITMAP(all_cluster_ids, NR_CPUS); struct sched_cluster *sched_cluster[NR_CPUS]; int num_clusters; unsigned int max_power_cost = 1; struct sched_cluster init_cluster = { .list = LIST_HEAD_INIT(init_cluster.list), .id = 0, .max_power_cost = 1, .min_power_cost = 1, .capacity = 1024, .max_possible_capacity = 1024, .efficiency = 1, .load_scale_factor = 1024, .cur_freq = 1, .max_freq = 1, .max_mitigated_freq = UINT_MAX, .min_freq = 1, .max_possible_freq = 1, .dstate = 0, .dstate_wakeup_energy = 0, .dstate_wakeup_latency = 0, .exec_scale_factor = 1024, .notifier_sent = 0, }; static void update_all_clusters_stats(void) { struct sched_cluster *cluster; u64 highest_mpc = 0, lowest_mpc = U64_MAX; pre_big_task_count_change(cpu_possible_mask); for_each_sched_cluster(cluster) { u64 mpc; cluster->capacity = compute_capacity(cluster); mpc = cluster->max_possible_capacity = compute_max_possible_capacity(cluster); cluster->load_scale_factor = compute_load_scale_factor(cluster); cluster->exec_scale_factor = DIV_ROUND_UP(cluster->efficiency * 1024, max_possible_efficiency); if (mpc > highest_mpc) highest_mpc = mpc; if (mpc < lowest_mpc) lowest_mpc = mpc; } max_possible_capacity = highest_mpc; min_max_possible_capacity = lowest_mpc; __update_min_max_capacity(); sched_update_freq_max_load(cpu_possible_mask); post_big_task_count_change(cpu_possible_mask); } static void assign_cluster_ids(struct list_head *head) { struct sched_cluster *cluster; int pos = 0; list_for_each_entry(cluster, head, list) { cluster->id = pos; sched_cluster[pos++] = cluster; } } static void move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) { struct list_head *first, *last; first = src->next; last 
= src->prev; if (sync_rcu) { INIT_LIST_HEAD_RCU(src); synchronize_rcu(); } first->prev = dst; dst->prev = last; last->next = dst; /* Ensure list sanity before making the head visible to all CPUs. */ smp_mb(); dst->next = first; } static int compare_clusters(void *priv, struct list_head *a, struct list_head *b) { struct sched_cluster *cluster1, *cluster2; int ret; cluster1 = container_of(a, struct sched_cluster, list); cluster2 = container_of(b, struct sched_cluster, list); ret = cluster1->max_power_cost > cluster2->max_power_cost || (cluster1->max_power_cost == cluster2->max_power_cost && cluster1->max_possible_capacity < cluster2->max_possible_capacity); return ret; } static void sort_clusters(void) { struct sched_cluster *cluster; struct list_head new_head; unsigned int tmp_max = 1; INIT_LIST_HEAD(&new_head); for_each_sched_cluster(cluster) { cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), max_task_load()); cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), 0); if (cluster->max_power_cost > tmp_max) tmp_max = cluster->max_power_cost; } max_power_cost = tmp_max; move_list(&new_head, &cluster_head, true); list_sort(NULL, &new_head, compare_clusters); assign_cluster_ids(&new_head); /* * Ensure cluster ids are visible to all CPUs before making * cluster_head visible. */ move_list(&cluster_head, &new_head, false); } static void insert_cluster(struct sched_cluster *cluster, struct list_head *head) { struct sched_cluster *tmp; struct list_head *iter = head; list_for_each_entry(tmp, head, list) { if (cluster->max_power_cost < tmp->max_power_cost) break; iter = &tmp->list; } list_add(&cluster->list, iter); } static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) { struct sched_cluster *cluster = NULL; cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); if (!cluster) { __WARN_printf("Cluster allocation failed. 
\ Possible bad scheduling\n"); return NULL; } INIT_LIST_HEAD(&cluster->list); cluster->max_power_cost = 1; cluster->min_power_cost = 1; cluster->capacity = 1024; cluster->max_possible_capacity = 1024; cluster->efficiency = 1; cluster->load_scale_factor = 1024; cluster->cur_freq = 1; cluster->max_freq = 1; cluster->max_mitigated_freq = UINT_MAX; cluster->min_freq = 1; cluster->max_possible_freq = 1; cluster->dstate = 0; cluster->dstate_wakeup_energy = 0; cluster->dstate_wakeup_latency = 0; cluster->freq_init_done = false; raw_spin_lock_init(&cluster->load_lock); cluster->cpus = *cpus; cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); if (cluster->efficiency > max_possible_efficiency) max_possible_efficiency = cluster->efficiency; if (cluster->efficiency < min_possible_efficiency) min_possible_efficiency = cluster->efficiency; cluster->notifier_sent = 0; return cluster; } static void add_cluster(const struct cpumask *cpus, struct list_head *head) { struct sched_cluster *cluster = alloc_new_cluster(cpus); int i; if (!cluster) return; for_each_cpu(i, cpus) cpu_rq(i)->cluster = cluster; insert_cluster(cluster, head); set_bit(num_clusters, all_cluster_ids); num_clusters++; } void update_cluster_topology(void) { struct cpumask cpus = *cpu_possible_mask; const struct cpumask *cluster_cpus; struct list_head new_head; int i; INIT_LIST_HEAD(&new_head); for_each_cpu(i, &cpus) { cluster_cpus = cpu_coregroup_mask(i); cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); cpumask_andnot(&cpus, &cpus, cluster_cpus); add_cluster(cluster_cpus, &new_head); } assign_cluster_ids(&new_head); /* * Ensure cluster ids are visible to all CPUs before making * cluster_head visible. */ move_list(&cluster_head, &new_head, false); } void init_clusters(void) { bitmap_clear(all_cluster_ids, 0, NR_CPUS); init_cluster.cpus = *cpu_possible_mask; raw_spin_lock_init(&init_cluster.load_lock); INIT_LIST_HEAD(&cluster_head); } int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) { mutex_lock(&cluster_lock); if (!cb->get_cpu_cycle_counter) { mutex_unlock(&cluster_lock); return -EINVAL; } cpu_cycle_counter_cb = *cb; use_cycle_counter = true; mutex_unlock(&cluster_lock); return 0; } int __init set_sched_enable_hmp(char *str) { int enable_hmp = 0; get_option(&str, &enable_hmp); sched_enable_hmp = !!enable_hmp; return 0; } early_param("sched_enable_hmp", set_sched_enable_hmp); /* Clear any HMP scheduler related requests pending from or on cpu */ void clear_hmp_request(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; clear_boost_kick(cpu); clear_reserved(cpu); if (rq->push_task) { raw_spin_lock_irqsave(&rq->lock, flags); if (rq->push_task) { clear_reserved(rq->push_cpu); put_task_struct(rq->push_task); rq->push_task = NULL; } rq->active_balance = 0; raw_spin_unlock_irqrestore(&rq->lock, flags); } } int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) { struct rq *rq = cpu_rq(cpu); rq->static_cpu_pwr_cost = cost; return 0; } unsigned int sched_get_static_cpu_pwr_cost(int cpu) { return cpu_rq(cpu)->static_cpu_pwr_cost; } int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) { struct sched_cluster *cluster = cpu_rq(cpu)->cluster; cluster->static_cluster_pwr_cost = cost; return 0; } unsigned int sched_get_static_cluster_pwr_cost(int cpu) { return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; } /* * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy * associated with them. 
 * This is required for atomic update of those variables when being
 * modified via the sysctl interface.
 *
 * IMPORTANT: Initialize both copies to the same value!!
 */

/*
 * Tasks that are runnable continuously for a period greater than
 * EARLY_DETECTION_DURATION can be flagged early as potential
 * high load tasks.
 */
#define EARLY_DETECTION_DURATION 9500000

static __read_mostly unsigned int sched_ravg_hist_size = 5;
__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;

static __read_mostly unsigned int sched_window_stats_policy =
	WINDOW_STATS_MAX_RECENT_AVG;
__read_mostly unsigned int sysctl_sched_window_stats_policy =
	WINDOW_STATS_MAX_RECENT_AVG;

#define SCHED_ACCOUNT_WAIT_TIME 1

__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);

unsigned int __read_mostly sysctl_sched_enable_colocation = 1;

/*
 * Enable colocation and frequency aggregation for all threads in a process.
 * The children inherit the group id from the parent.
 */
unsigned int __read_mostly sysctl_sched_enable_thread_grouping;

__read_mostly unsigned int sysctl_sched_new_task_windows = 5;

#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0

/*
 * This governs what load needs to be used when reporting CPU busy time
 * to the cpufreq governor.
 */
__read_mostly unsigned int sysctl_sched_freq_reporting_policy;

/*
 * For increase, send notification if
 * freq_required - cur_freq > sysctl_sched_freq_inc_notify
 */
__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */

/*
 * For decrease, send notification if
 * cur_freq - freq_required > sysctl_sched_freq_dec_notify
 */
__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */

static __read_mostly unsigned int sched_io_is_busy;

__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;

/*
 * Maximum possible frequency across all cpus. Task demand and cpu
 * capacity (cpu_power) metrics are scaled in reference to it.
 */
unsigned int max_possible_freq = 1;

/*
 * Minimum possible max_freq across all cpus. This will be the same as
 * max_possible_freq on homogeneous systems and could be different from
 * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
 * capacity (cpu_power) of cpus.
 */
unsigned int min_max_freq = 1;

unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */

/* Min window size (in ns) = 10ms */
#define MIN_SCHED_RAVG_WINDOW 10000000

/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000

/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;

/* Maximum allowed threshold before freq aggregation must be enabled */
#define MAX_FREQ_AGGR_THRESH 1000

/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;

/*
 * Major task runtime. If a task runs for more than sched_major_task_runtime
 * in a window, it's considered to be generating the majority of the workload
 * for this window. Prediction could be adjusted for such tasks.
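 * For example, with the default 10 ms window the static initializer below
 * (10000000 ns) spans the entire window; set_hmp_defaults() later rescales
 * the threshold to MAJOR_TASK_PCT percent of sched_ravg_window so that it
 * tracks any boot-time window-size override.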
*/ __read_mostly unsigned int sched_major_task_runtime = 10000000; static unsigned int sync_cpu; static LIST_HEAD(related_thread_groups); static DEFINE_RWLOCK(related_thread_group_lock); #define for_each_related_thread_group(grp) \ list_for_each_entry(grp, &related_thread_groups, list) /* * Task load is categorized into buckets for the purpose of top task tracking. * The entire range of load from 0 to sched_ravg_window needs to be covered * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute * sched_load_granule. */ __read_mostly unsigned int sched_load_granule = MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES; /* Size of bitmaps maintained to track top tasks */ static const unsigned int top_tasks_bitmap_size = BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long); /* * Demand aggregation for frequency purpose: * * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads * for frequency determination purpose. This aggregation is done per-cluster. * * CPU demand of tasks from various related groups is aggregated per-cluster and * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined * by just rq->prev_runnable_sum. * * Some examples follow, which assume: * Cluster0 = CPU0-3, Cluster1 = CPU4-7 * One related thread group A that has tasks A0, A1, A2 * * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of * tasks belonging to group A are accumulated when they run on cpu X. * * CX->curr/prev_sum = counters in which cpu execution stats of all tasks * not belonging to group A are accumulated when they run on cpu X * * Lets say the stats for window M was as below: * * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms * Task A0 ran 5ms on CPU0 * Task B0 ran 1ms on CPU0 * * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms * Task A1 ran 4ms on CPU1 * Task A2 ran 2ms on CPU1 * Task B1 ran 5ms on CPU1 * * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 * CPU2 idle * * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 * CPU3 idle * * In this case, CPU1 was most busy going by just its prev_sum counter. Demand * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy * time reported to governor will be: * * * C0 busy time = 1ms * C1 busy time = 5 + 5 + 6 = 16ms * */ static __read_mostly unsigned int sched_freq_aggregate = 1; __read_mostly unsigned int sysctl_sched_freq_aggregate = 1; unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; static unsigned int __read_mostly sched_freq_aggregate_threshold; /* Initial task load. Newly created tasks are assigned this load. */ unsigned int __read_mostly sched_init_task_load_windows; unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; unsigned int max_task_load(void) { return sched_ravg_window; } /* Use this knob to turn on or off HMP-aware task placement logic */ unsigned int __read_mostly sched_enable_hmp; /* A cpu can no longer accommodate more tasks if: * * rq->nr_running > sysctl_sched_spill_nr_run || * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load */ unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; /* * Place sync wakee tasks those have less than configured demand to the waker's * cluster. 
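 * For example, with the default sysctl_sched_small_wakee_task_load_pct of
 * 10 and a 10 ms window, set_hmp_defaults() sets sched_small_wakee_task_load
 * to 1 ms of window demand; sync wakees below that demand are placed in the
 * waker's cluster.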
*/ unsigned int __read_mostly sched_small_wakee_task_load; unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; unsigned int __read_mostly sched_big_waker_task_load; unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; /* * CPUs with load greater than the sched_spill_load_threshold are not * eligible for task placement. When all CPUs in a cluster achieve a * load higher than this level, tasks becomes eligible for inter * cluster migration. */ unsigned int __read_mostly sched_spill_load; unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; /* * Prefer the waker CPU for sync wakee task, if the CPU has only 1 runnable * task. This eliminates the LPM exit latency associated with the idle * CPUs in the waker cluster. */ unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker; /* * Tasks whose bandwidth consumption on a cpu is more than * sched_upmigrate are considered "big" tasks. Big tasks will be * considered for "up" migration, i.e migrating to a cpu with better * capacity. */ unsigned int __read_mostly sched_upmigrate; unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; /* * Big tasks, once migrated, will need to drop their bandwidth * consumption to less than sched_downmigrate before they are "down" * migrated. */ unsigned int __read_mostly sched_downmigrate; unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; /* * Task groups whose aggregate demand on a cpu is more than * sched_group_upmigrate need to be up-migrated if possible. */ unsigned int __read_mostly sched_group_upmigrate; unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100; /* * Task groups, once up-migrated, will need to drop their aggregate * demand to less than sched_group_downmigrate before they are "down" * migrated. */ unsigned int __read_mostly sched_group_downmigrate; unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95; /* * The load scale factor of a CPU gets boosted when its max frequency * is restricted due to which the tasks are migrating to higher capacity * CPUs early. The sched_upmigrate threshold is auto-upgraded by * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. */ unsigned int up_down_migrate_scale_factor = 1024; /* * Scheduler selects and places task to its previous CPU if sleep time is * less than sysctl_sched_select_prev_cpu_us. 
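 * For example, the default sysctl_sched_select_prev_cpu_us of 2000
 * translates to a sched_short_sleep_task_threshold of 2 ms
 * (2000 * NSEC_PER_USEC) in set_hmp_defaults().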
*/ unsigned int __read_mostly sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; unsigned int __read_mostly sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; static void _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate) { unsigned int delta; if (up_down_migrate_scale_factor == 1024) return; delta = *up_migrate - *down_migrate; *up_migrate /= NSEC_PER_USEC; *up_migrate *= up_down_migrate_scale_factor; *up_migrate >>= 10; *up_migrate *= NSEC_PER_USEC; *up_migrate = min(*up_migrate, sched_ravg_window); *down_migrate /= NSEC_PER_USEC; *down_migrate *= up_down_migrate_scale_factor; *down_migrate >>= 10; *down_migrate *= NSEC_PER_USEC; *down_migrate = min(*down_migrate, *up_migrate - delta); } static void update_up_down_migrate(void) { unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); _update_up_down_migrate(&up_migrate, &down_migrate); sched_upmigrate = up_migrate; sched_downmigrate = down_migrate; up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); _update_up_down_migrate(&up_migrate, &down_migrate); sched_group_upmigrate = up_migrate; sched_group_downmigrate = down_migrate; } void set_hmp_defaults(void) { sched_spill_load = pct_to_real(sysctl_sched_spill_load_pct); update_up_down_migrate(); sched_major_task_runtime = mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); sched_init_task_load_windows = div64_u64((u64)sysctl_sched_init_task_load_pct * (u64)sched_ravg_window, 100); sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * NSEC_PER_USEC; sched_small_wakee_task_load = div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * (u64)sched_ravg_window, 100); sched_big_waker_task_load = div64_u64((u64)sysctl_sched_big_waker_task_load_pct * (u64)sched_ravg_window, 100); sched_freq_aggregate_threshold = pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); } u32 sched_get_init_task_load(struct task_struct *p) { return p->init_load_pct; } int sched_set_init_task_load(struct task_struct *p, int init_load_pct) { if (init_load_pct < 0 || init_load_pct > 100) return -EINVAL; p->init_load_pct = init_load_pct; return 0; } #ifdef CONFIG_CGROUP_SCHED int upmigrate_discouraged(struct task_struct *p) { return task_group(p)->upmigrate_discouraged; } #else static inline int upmigrate_discouraged(struct task_struct *p) { return 0; } #endif /* Is a task "big" on its current cpu */ static inline int __is_big_task(struct task_struct *p, u64 scaled_load) { int nice = task_nice(p); if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) return 0; return scaled_load > sched_upmigrate; } int is_big_task(struct task_struct *p) { return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); } u64 cpu_load(int cpu) { struct rq *rq = cpu_rq(cpu); return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); } u64 cpu_load_sync(int cpu, int sync) { return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); } /* * Task will fit on a cpu if it's bandwidth consumption on that cpu * will be less than sched_upmigrate. A big task that was previously * "up" migrated will be considered fitting on "little" cpu if its * bandwidth consumption on "little" cpu will be less than * sched_downmigrate. 
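 * As a worked example with the default tunables and the default 10 ms
 * window: sched_upmigrate = 80% -> 8 ms and sched_downmigrate = 60% -> 6 ms
 * of window demand (assuming no up_down_migrate_scale_factor adjustment),
 * so an up-migrated task must drop below 6 ms of demand before it is again
 * considered to fit on a lower-capacity cpu.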
This will help avoid frequenty migrations for * tasks with load close to the upmigrate threshold */ int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, enum sched_boost_policy boost_policy) { int upmigrate = sched_upmigrate; if (cpu_capacity(cpu) == max_capacity) return 1; if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) upmigrate = sched_downmigrate; if (boost_policy != SCHED_BOOST_ON_BIG) { if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) return 1; if (task_load < upmigrate) return 1; } else { if (task_sched_boost(p) || task_load >= upmigrate) return 0; return 1; } return 0; } int task_will_fit(struct task_struct *p, int cpu) { u64 tload = scale_load_to_cpu(task_load(p), cpu); return task_load_will_fit(p, tload, cpu, sched_boost_policy()); } static int group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp, u64 demand, bool group_boost) { int cpu = cluster_first_cpu(cluster); int prev_capacity = 0; unsigned int threshold = sched_group_upmigrate; u64 load; if (cluster->capacity == max_capacity) return 1; if (group_boost) return 0; if (!demand) return 1; if (grp->preferred_cluster) prev_capacity = grp->preferred_cluster->capacity; if (cluster->capacity < prev_capacity) threshold = sched_group_downmigrate; load = scale_load_to_cpu(demand, cpu); if (load < threshold) return 1; return 0; } /* * Return the cost of running task p on CPU cpu. This function * currently assumes that task p is the only task which will run on * the CPU. */ unsigned int power_cost(int cpu, u64 demand) { int first, mid, last; struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); struct cpu_pstate_pwr *costs; struct freq_max_load *max_load; int total_static_pwr_cost = 0; struct rq *rq = cpu_rq(cpu); unsigned int pc; if (!per_cpu_info || !per_cpu_info[cpu].ptable) /* * When power aware scheduling is not in use, or CPU * power data is not available, just use the CPU * capacity as a rough stand-in for real CPU power * numbers, assuming bigger CPUs are more power * hungry. 
*/ return cpu_max_possible_capacity(cpu); rcu_read_lock(); max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); if (!max_load) { pc = cpu_max_possible_capacity(cpu); goto unlock; } costs = per_cpu_info[cpu].ptable; if (demand <= max_load->freqs[0].hdemand) { pc = costs[0].power; goto unlock; } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { pc = costs[max_load->length - 1].power; goto unlock; } first = 0; last = max_load->length - 1; mid = (last - first) >> 1; while (1) { if (demand <= max_load->freqs[mid].hdemand) last = mid; else first = mid; if (last - first == 1) break; mid = first + ((last - first) >> 1); } pc = costs[last].power; unlock: rcu_read_unlock(); if (idle_cpu(cpu) && rq->cstate) { total_static_pwr_cost += rq->static_cpu_pwr_cost; if (rq->cluster->dstate) total_static_pwr_cost += rq->cluster->static_cluster_pwr_cost; } return pc + total_static_pwr_cost; } void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { if (!sched_enable_hmp || sched_disable_window_stats) return; if (is_big_task(p)) stats->nr_big_tasks++; } void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) { if (!sched_enable_hmp || sched_disable_window_stats) return; if (is_big_task(p)) stats->nr_big_tasks--; BUG_ON(stats->nr_big_tasks < 0); } void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { inc_nr_big_task(&rq->hmp_stats, p); if (change_cra) inc_cumulative_runnable_avg(&rq->hmp_stats, p); } void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { dec_nr_big_task(&rq->hmp_stats, p); if (change_cra) dec_cumulative_runnable_avg(&rq->hmp_stats, p); } void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) { stats->nr_big_tasks = 0; if (reset_cra) { stats->cumulative_runnable_avg = 0; stats->pred_demands_sum = 0; } } int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) { struct related_thread_group *grp; int rc = 0; rcu_read_lock(); grp = task_related_thread_group(p); if (!grp || !sysctl_sched_enable_colocation) rc = 1; else rc = (grp->preferred_cluster == cluster); rcu_read_unlock(); return rc; } struct sched_cluster *rq_cluster(struct rq *rq) { return rq->cluster; } /* * reset_cpu_hmp_stats - reset HMP stats for a cpu * nr_big_tasks * cumulative_runnable_avg (iff reset_cra is true) */ void reset_cpu_hmp_stats(int cpu, int reset_cra) { reset_cfs_rq_hmp_stats(cpu, reset_cra); reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); } void fixup_nr_big_tasks(struct hmp_sched_stats *stats, struct task_struct *p, s64 delta) { u64 new_task_load; u64 old_task_load; if (!sched_enable_hmp || sched_disable_window_stats) return; old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) stats->nr_big_tasks--; else if (!__is_big_task(p, old_task_load) && __is_big_task(p, new_task_load)) stats->nr_big_tasks++; BUG_ON(stats->nr_big_tasks < 0); } /* * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. 
*/ static void update_nr_big_tasks(int cpu) { struct rq *rq = cpu_rq(cpu); struct task_struct *p; /* Do not reset cumulative_runnable_avg */ reset_cpu_hmp_stats(cpu, 0); list_for_each_entry(p, &rq->cfs_tasks, se.group_node) _inc_hmp_sched_stats_fair(rq, p, 0); } /* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ void pre_big_task_count_change(const struct cpumask *cpus) { int i; local_irq_disable(); for_each_cpu(i, cpus) raw_spin_lock(&cpu_rq(i)->lock); } /* * Reinitialize 'nr_big_tasks' counters on all affected cpus */ void post_big_task_count_change(const struct cpumask *cpus) { int i; /* Assumes local_irq_disable() keeps online cpumap stable */ for_each_cpu(i, cpus) update_nr_big_tasks(i); for_each_cpu(i, cpus) raw_spin_unlock(&cpu_rq(i)->lock); local_irq_enable(); } DEFINE_MUTEX(policy_mutex); unsigned int update_freq_aggregate_threshold(unsigned int threshold) { unsigned int old_threshold; mutex_lock(&policy_mutex); old_threshold = sysctl_sched_freq_aggregate_threshold_pct; sysctl_sched_freq_aggregate_threshold_pct = threshold; sched_freq_aggregate_threshold = pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); mutex_unlock(&policy_mutex); return old_threshold; } static inline int invalid_value_freq_input(unsigned int *data) { if (data == &sysctl_sched_freq_aggregate) return !(*data == 0 || *data == 1); return 0; } static inline int invalid_value(unsigned int *data) { unsigned int val = *data; if (data == &sysctl_sched_ravg_hist_size) return (val < 2 || val > RAVG_HIST_SIZE_MAX); if (data == &sysctl_sched_window_stats_policy) return val >= WINDOW_STATS_INVALID_POLICY; return invalid_value_freq_input(data); } /* * Handle "atomic" update of sysctl_sched_window_stats_policy, * sysctl_sched_ravg_hist_size variables. */ int sched_window_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; unsigned int *data = (unsigned int *)table->data; unsigned int old_val; if (!sched_enable_hmp) return -EINVAL; mutex_lock(&policy_mutex); old_val = *data; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (write && (old_val == *data))) goto done; if (invalid_value(data)) { *data = old_val; ret = -EINVAL; goto done; } reset_all_window_stats(0, 0); done: mutex_unlock(&policy_mutex); return ret; } /* * Convert percentage value into absolute form. This will avoid div() operation * in fast path, to convert task load in percentage scale. */ int sched_hmp_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; unsigned int old_val; unsigned int *data = (unsigned int *)table->data; int update_min_nice = 0; mutex_lock(&policy_mutex); old_val = *data; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write || !sched_enable_hmp) goto done; if (write && (old_val == *data)) goto done; if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct || sysctl_sched_group_downmigrate_pct > sysctl_sched_group_upmigrate_pct) { *data = old_val; ret = -EINVAL; goto done; } /* * Big task tunable change will need to re-classify tasks on * runqueue as big and set their counters appropriately. * sysctl interface affects secondary variables (*_pct), which is then * "atomically" carried over to the primary variables. Atomic change * includes taking runqueue lock of all online cpus and re-initiatizing * their big counter values based on changed criteria. 
*/ if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { get_online_cpus(); pre_big_task_count_change(cpu_online_mask); } set_hmp_defaults(); if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { post_big_task_count_change(cpu_online_mask); put_online_cpus(); } done: mutex_unlock(&policy_mutex); return ret; } inline int nr_big_tasks(struct rq *rq) { return rq->hmp_stats.nr_big_tasks; } unsigned int cpu_temp(int cpu) { struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); if (per_cpu_info) return per_cpu_info[cpu].temp; else return 0; } void free_task_load_ptrs(struct task_struct *p) { kfree(p->ravg.curr_window_cpu); kfree(p->ravg.prev_window_cpu); /* * update_task_ravg() can be called for exiting tasks. While the * function itself ensures correct behavior, the corresponding * trace event requires that these pointers be NULL. */ p->ravg.curr_window_cpu = NULL; p->ravg.prev_window_cpu = NULL; } void init_new_task_load(struct task_struct *p, bool idle_task) { int i; u32 init_load_windows = sched_init_task_load_windows; u32 init_load_pct = current->init_load_pct; p->init_load_pct = 0; rcu_assign_pointer(p->grp, NULL); INIT_LIST_HEAD(&p->grp_list); memset(&p->ravg, 0, sizeof(struct ravg)); p->cpu_cycles = 0; p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); /* Don't have much choice. CPU frequency would be bogus */ BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu); if (idle_task) return; if (init_load_pct) init_load_windows = div64_u64((u64)init_load_pct * (u64)sched_ravg_window, 100); p->ravg.demand = init_load_windows; p->ravg.pred_demand = 0; for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) p->ravg.sum_history[i] = init_load_windows; } /* Return task demand in percentage scale */ unsigned int pct_task_load(struct task_struct *p) { unsigned int load; load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); return load; } /* * Return total number of tasks "eligible" to run on highest capacity cpu * * This is simply nr_big_tasks for cpus which are not of max_capacity and * nr_running for cpus of max_capacity */ unsigned int nr_eligible_big_tasks(int cpu) { struct rq *rq = cpu_rq(cpu); int nr_big = rq->hmp_stats.nr_big_tasks; int nr = rq->nr_running; if (cpu_max_possible_capacity(cpu) != max_possible_capacity) return nr_big; return nr; } static inline int exiting_task(struct task_struct *p) { return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); } static int __init set_sched_ravg_window(char *str) { unsigned int window_size; get_option(&str, &window_size); if (window_size < MIN_SCHED_RAVG_WINDOW || window_size > MAX_SCHED_RAVG_WINDOW) { WARN_ON(1); return -EINVAL; } sched_ravg_window = window_size; return 0; } early_param("sched_ravg_window", set_sched_ravg_window); static inline void update_window_start(struct rq *rq, u64 wallclock) { s64 delta; int nr_windows; delta = wallclock - rq->window_start; BUG_ON(delta < 0); if (delta < sched_ravg_window) return; nr_windows = div64_u64(delta, sched_ravg_window); rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; } #define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) static inline u64 scale_exec_time(u64 delta, struct rq *rq) { u32 freq; freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); delta *= rq->cluster->exec_scale_factor; delta >>= 10; return delta; } static inline int cpu_is_waiting_on_io(struct rq *rq) { if (!sched_io_is_busy) return 0; 
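	/*
	 * sched_io_is_busy is set: report the CPU as busy while any of its
	 * tasks is blocked waiting on I/O (tracked by rq->nr_iowait).
	 */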
return atomic_read(&rq->nr_iowait); } /* Does freq_required sufficiently exceed or fall behind cur_freq? */ static inline int nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) { int delta = freq_required - cur_freq; if (freq_required > cur_freq) return delta < sysctl_sched_freq_inc_notify; delta = -delta; return delta < sysctl_sched_freq_dec_notify; } /* Convert busy time to frequency equivalent */ static inline unsigned int load_to_freq(struct rq *rq, u64 load) { unsigned int freq; load = scale_load_to_cpu(load, cpu_of(rq)); load *= 128; load = div64_u64(load, max_task_load()); freq = load * cpu_max_possible_freq(cpu_of(rq)); freq /= 128; return freq; } static inline struct group_cpu_time * _group_cpu_time(struct related_thread_group *grp, int cpu); /* * Return load from all related group in given cpu. * Caller must ensure that related_thread_group_lock is held. */ static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) { struct related_thread_group *grp; for_each_related_thread_group(grp) { struct group_cpu_time *cpu_time; cpu_time = _group_cpu_time(grp, cpu); *grp_load += cpu_time->prev_runnable_sum; if (new_grp_load) *new_grp_load += cpu_time->nt_prev_runnable_sum; } } /* * Return load from all related groups in given frequency domain. * Caller must ensure that related_thread_group_lock is held. */ static void group_load_in_freq_domain(struct cpumask *cpus, u64 *grp_load, u64 *new_grp_load) { struct related_thread_group *grp; int j; for_each_related_thread_group(grp) { for_each_cpu(j, cpus) { struct group_cpu_time *cpu_time; cpu_time = _group_cpu_time(grp, j); *grp_load += cpu_time->prev_runnable_sum; *new_grp_load += cpu_time->nt_prev_runnable_sum; } } } static inline u64 freq_policy_load(struct rq *rq, u64 load); /* * Should scheduler alert governor for changing frequency? * * @check_pred - evaluate frequency based on the predictive demand * @check_groups - add load from all related groups on given cpu * * check_groups is set to 1 if a "related" task movement/wakeup is triggering * the notification check. 
To avoid "re-aggregation" of demand in such cases, * we check whether the migrated/woken tasks demand (along with demand from * existing tasks on the cpu) can be met on target cpu * */ static int send_notification(struct rq *rq, int check_pred, int check_groups) { unsigned int cur_freq, freq_required; unsigned long flags; int rc = 0; u64 group_load = 0, new_load = 0; if (!sched_enable_hmp) return 0; if (check_pred) { u64 prev = rq->old_busy_time; u64 predicted = rq->hmp_stats.pred_demands_sum; if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) return 0; prev = max(prev, rq->old_estimated_time); if (prev > predicted) return 0; cur_freq = load_to_freq(rq, prev); freq_required = load_to_freq(rq, predicted); if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) return 0; } else { read_lock_irqsave(&related_thread_group_lock, flags); /* * Protect from concurrent update of rq->prev_runnable_sum and * group cpu load */ raw_spin_lock(&rq->lock); if (check_groups) _group_load_in_cpu(cpu_of(rq), &group_load, NULL); new_load = rq->prev_runnable_sum + group_load; new_load = freq_policy_load(rq, new_load); raw_spin_unlock(&rq->lock); read_unlock_irqrestore(&related_thread_group_lock, flags); cur_freq = load_to_freq(rq, rq->old_busy_time); freq_required = load_to_freq(rq, new_load); if (nearly_same_freq(cur_freq, freq_required)) return 0; } raw_spin_lock_irqsave(&rq->lock, flags); if (!rq->cluster->notifier_sent) { rq->cluster->notifier_sent = 1; rc = 1; trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, new_load); } raw_spin_unlock_irqrestore(&rq->lock, flags); return rc; } /* Alert governor if there is a need to change frequency */ void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { int cpu = cpu_of(rq); if (!send_notification(rq, check_pred, check_groups)) return; atomic_notifier_call_chain( &load_alert_notifier_head, 0, (void *)(long)cpu); } void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, struct task_struct *p) { bool check_groups; rcu_read_lock(); check_groups = task_in_related_thread_group(p); rcu_read_unlock(); if (!same_freq_domain(src_cpu, dest_cpu)) { if (!src_cpu_dead) check_for_freq_change(cpu_rq(src_cpu), false, check_groups); check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); } else { check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); } } static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { if (is_idle_task(p)) { /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ if (event == PICK_NEXT_TASK) return 0; /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ return irqtime || cpu_is_waiting_on_io(rq); } if (event == TASK_WAKE) return 0; if (event == PUT_PREV_TASK || event == IRQ_UPDATE) return 1; /* * TASK_UPDATE can be called on sleeping task, when its moved between * related groups */ if (event == TASK_UPDATE) { if (rq->curr == p) return 1; return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; } /* TASK_MIGRATE, PICK_NEXT_TASK left */ return SCHED_FREQ_ACCOUNT_WAIT_TIME; } static inline bool is_new_task(struct task_struct *p) { return p->ravg.active_windows < sysctl_sched_new_task_windows; } #define INC_STEP 8 #define DEC_STEP 2 #define CONSISTENT_THRES 16 #define INC_STEP_BIG 16 /* * bucket_increase - update the count of all buckets * * @buckets: array of buckets tracking busy time of a task * @idx: the index of bucket to be incremented * * Each time a complete window finishes, count of bucket that runtime * falls in (@idx) is incremented. 
Counts of all other buckets are * decayed. The rate of increase and decay could be different based * on current count in the bucket. */ static inline void bucket_increase(u8 *buckets, int idx) { int i, step; for (i = 0; i < NUM_BUSY_BUCKETS; i++) { if (idx != i) { if (buckets[i] > DEC_STEP) buckets[i] -= DEC_STEP; else buckets[i] = 0; } else { step = buckets[i] >= CONSISTENT_THRES ? INC_STEP_BIG : INC_STEP; if (buckets[i] > U8_MAX - step) buckets[i] = U8_MAX; else buckets[i] += step; } } } static inline int busy_to_bucket(u32 normalized_rt) { int bidx; bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); bidx = min(bidx, NUM_BUSY_BUCKETS - 1); /* * Combine lowest two buckets. The lowest frequency falls into * 2nd bucket and thus keep predicting lowest bucket is not * useful. */ if (!bidx) bidx++; return bidx; } static inline u64 scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) { return div64_u64(load * (u64)src_freq, (u64)dst_freq); } #define HEAVY_TASK_SKIP 2 #define HEAVY_TASK_SKIP_LIMIT 4 /* * get_pred_busy - calculate predicted demand for a task on runqueue * * @rq: runqueue of task p * @p: task whose prediction is being updated * @start: starting bucket. returned prediction should not be lower than * this bucket. * @runtime: runtime of the task. returned prediction should not be lower * than this runtime. * Note: @start can be derived from @runtime. It's passed in only to * avoid duplicated calculation in some cases. * * A new predicted busy time is returned for task @p based on @runtime * passed in. The function searches through buckets that represent busy * time equal to or bigger than @runtime and attempts to find the bucket to * to use for prediction. Once found, it searches through historical busy * time and returns the latest that falls into the bucket. If no such busy * time exists, it returns the medium of that bucket. */ static u32 get_pred_busy(struct rq *rq, struct task_struct *p, int start, u32 runtime) { int i; u8 *buckets = p->ravg.busy_buckets; u32 *hist = p->ravg.sum_history; u32 dmin, dmax; u64 cur_freq_runtime = 0; int first = NUM_BUSY_BUCKETS, final, skip_to; u32 ret = runtime; /* skip prediction for new tasks due to lack of history */ if (unlikely(is_new_task(p))) goto out; /* find minimal bucket index to pick */ for (i = start; i < NUM_BUSY_BUCKETS; i++) { if (buckets[i]) { first = i; break; } } /* if no higher buckets are filled, predict runtime */ if (first >= NUM_BUSY_BUCKETS) goto out; /* compute the bucket for prediction */ final = first; if (first < HEAVY_TASK_SKIP_LIMIT) { /* compute runtime at current CPU frequency */ cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, rq->cluster->efficiency); cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, max_possible_freq, rq->cluster->cur_freq); /* * if the task runs for majority of the window, try to * pick higher buckets. */ if (cur_freq_runtime >= sched_major_task_runtime) { int next = NUM_BUSY_BUCKETS; /* * if there is a higher bucket that's consistently * hit, don't jump beyond that. 
*/ for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && i < NUM_BUSY_BUCKETS; i++) { if (buckets[i] > CONSISTENT_THRES) { next = i; break; } } skip_to = min(next, start + HEAVY_TASK_SKIP); /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); /* don't go below first non-empty bucket, if any */ final = max(first, skip_to); } } /* determine demand range for the predicted bucket */ if (final < 2) { /* lowest two buckets are combined */ dmin = 0; final = 1; } else { dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); } dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); /* * search through runtime history and return first runtime that falls * into the range of predicted bucket. */ for (i = 0; i < sched_ravg_hist_size; i++) { if (hist[i] >= dmin && hist[i] < dmax) { ret = hist[i]; break; } } /* no historical runtime within bucket found, use average of the bin */ if (ret < dmin) ret = (dmin + dmax) / 2; /* * when updating in middle of a window, runtime could be higher * than all recorded history. Always predict at least runtime. */ ret = max(runtime, ret); out: trace_sched_update_pred_demand(rq, p, runtime, mult_frac((unsigned int)cur_freq_runtime, 100, sched_ravg_window), ret); return ret; } static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) { if (p->ravg.pred_demand >= p->ravg.curr_window) return p->ravg.pred_demand; return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), p->ravg.curr_window); } /* * predictive demand of a task is calculated at the window roll-over. * if the task current window busy time exceeds the predicted * demand, update it here to reflect the task needs. */ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) { u32 new, old; if (is_idle_task(p) || exiting_task(p)) return; if (event != PUT_PREV_TASK && event != TASK_UPDATE && (!SCHED_FREQ_ACCOUNT_WAIT_TIME || (event != TASK_MIGRATE && event != PICK_NEXT_TASK))) return; /* * TASK_UPDATE can be called on sleeping task, when its moved between * related groups */ if (event == TASK_UPDATE) { if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) return; } new = calc_pred_demand(rq, p); old = p->ravg.pred_demand; if (old >= new) return; if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || !p->dl.dl_throttled)) p->sched_class->fixup_hmp_sched_stats(rq, p, p->ravg.demand, new); p->ravg.pred_demand = new; } void clear_top_tasks_bitmap(unsigned long *bitmap) { memset(bitmap, 0, top_tasks_bitmap_size); __set_bit(NUM_LOAD_INDICES, bitmap); } /* * Special case the last index and provide a fast path for index = 0. * Note that sched_load_granule can change underneath us if we are not * holding any runqueue locks while calling the two functions below. 
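 * As a rough illustration (the bucket count here is only an assumed
 * example): if NUM_LOAD_INDICES were 1000, the default 10 ms window would
 * give a sched_load_granule of 10 us and load_to_index() would map a
 * 2.5 ms curr_window to index 250.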
*/ static u32 top_task_load(struct rq *rq) { int index = rq->prev_top; u8 prev = 1 - rq->curr_table; if (!index) { int msb = NUM_LOAD_INDICES - 1; if (!test_bit(msb, rq->top_tasks_bitmap[prev])) return 0; else return sched_load_granule; } else if (index == NUM_LOAD_INDICES - 1) { return sched_ravg_window; } else { return (index + 1) * sched_load_granule; } } static int load_to_index(u32 load) { if (load < sched_load_granule) return 0; else if (load >= sched_ravg_window) return NUM_LOAD_INDICES - 1; else return load / sched_load_granule; } static void update_top_tasks(struct task_struct *p, struct rq *rq, u32 old_curr_window, int new_window, bool full_window) { u8 curr = rq->curr_table; u8 prev = 1 - curr; u8 *curr_table = rq->top_tasks[curr]; u8 *prev_table = rq->top_tasks[prev]; int old_index, new_index, update_index; u32 curr_window = p->ravg.curr_window; u32 prev_window = p->ravg.prev_window; bool zero_index_update; if (old_curr_window == curr_window && !new_window) return; old_index = load_to_index(old_curr_window); new_index = load_to_index(curr_window); if (!new_window) { zero_index_update = !old_curr_window && curr_window; if (old_index != new_index || zero_index_update) { if (old_curr_window) curr_table[old_index] -= 1; if (curr_window) curr_table[new_index] += 1; if (new_index > rq->curr_top) rq->curr_top = new_index; } if (!curr_table[old_index]) __clear_bit(NUM_LOAD_INDICES - old_index - 1, rq->top_tasks_bitmap[curr]); if (curr_table[new_index] == 1) __set_bit(NUM_LOAD_INDICES - new_index - 1, rq->top_tasks_bitmap[curr]); return; } /* * The window has rolled over for this task. By the time we get * here, curr/prev swaps would has already occurred. So we need * to use prev_window for the new index. */ update_index = load_to_index(prev_window); if (full_window) { /* * Two cases here. Either 'p' ran for the entire window or * it didn't run at all. In either case there is no entry * in the prev table. If 'p' ran the entire window, we just * need to create a new entry in the prev table. In this case * update_index will be correspond to sched_ravg_window * so we can unconditionally update the top index. 
*/ if (prev_window) { prev_table[update_index] += 1; rq->prev_top = update_index; } if (prev_table[update_index] == 1) __set_bit(NUM_LOAD_INDICES - update_index - 1, rq->top_tasks_bitmap[prev]); } else { zero_index_update = !old_curr_window && prev_window; if (old_index != update_index || zero_index_update) { if (old_curr_window) prev_table[old_index] -= 1; prev_table[update_index] += 1; if (update_index > rq->prev_top) rq->prev_top = update_index; if (!prev_table[old_index]) __clear_bit(NUM_LOAD_INDICES - old_index - 1, rq->top_tasks_bitmap[prev]); if (prev_table[update_index] == 1) __set_bit(NUM_LOAD_INDICES - update_index - 1, rq->top_tasks_bitmap[prev]); } } if (curr_window) { curr_table[new_index] += 1; if (new_index > rq->curr_top) rq->curr_top = new_index; if (curr_table[new_index] == 1) __set_bit(NUM_LOAD_INDICES - new_index - 1, rq->top_tasks_bitmap[curr]); } } static inline void clear_top_tasks_table(u8 *table) { memset(table, 0, NUM_LOAD_INDICES * sizeof(u8)); } static u32 empty_windows[NR_CPUS]; static void rollover_task_window(struct task_struct *p, bool full_window) { u32 *curr_cpu_windows = empty_windows; u32 curr_window; int i; /* Rollover the sum */ curr_window = 0; if (!full_window) { curr_window = p->ravg.curr_window; curr_cpu_windows = p->ravg.curr_window_cpu; } p->ravg.prev_window = curr_window; p->ravg.curr_window = 0; /* Roll over individual CPU contributions */ for (i = 0; i < nr_cpu_ids; i++) { p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; p->ravg.curr_window_cpu[i] = 0; } } /* * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) */ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { int new_window, full_window = 0; int p_is_curr_task = (p == rq->curr); u64 mark_start = p->ravg.mark_start; u64 window_start = rq->window_start; u32 window_size = sched_ravg_window; u64 delta; u64 *curr_runnable_sum = &rq->curr_runnable_sum; u64 *prev_runnable_sum = &rq->prev_runnable_sum; u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; int flip_counters = 0; int prev_sum_reset = 0; bool new_task; struct related_thread_group *grp; int cpu = rq->cpu; u32 old_curr_window; new_window = mark_start < window_start; if (new_window) { full_window = (window_start - mark_start) >= window_size; if (p->ravg.active_windows < USHRT_MAX) p->ravg.active_windows++; } new_task = is_new_task(p); grp = p->grp; if (grp && sched_freq_aggregate) { /* cpu_time protected by rq_lock */ struct group_cpu_time *cpu_time = _group_cpu_time(grp, cpu_of(rq)); curr_runnable_sum = &cpu_time->curr_runnable_sum; prev_runnable_sum = &cpu_time->prev_runnable_sum; nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; if (cpu_time->window_start != rq->window_start) { int nr_windows; delta = rq->window_start - cpu_time->window_start; nr_windows = div64_u64(delta, window_size); if (nr_windows > 1) prev_sum_reset = 1; cpu_time->window_start = rq->window_start; flip_counters = 1; } if (p_is_curr_task && new_window) { u64 curr_sum = rq->curr_runnable_sum; u64 nt_curr_sum = rq->nt_curr_runnable_sum; if (full_window) curr_sum = nt_curr_sum = 0; rq->prev_runnable_sum = curr_sum; rq->nt_prev_runnable_sum = nt_curr_sum; rq->curr_runnable_sum = 0; rq->nt_curr_runnable_sum = 0; } } else { if (p_is_curr_task && new_window) { flip_counters = 1; if (full_window) prev_sum_reset = 1; } } /* * Handle per-task window rollover. 
We don't care about the idle * task or exiting tasks. */ if (!is_idle_task(p) && !exiting_task(p)) { old_curr_window = p->ravg.curr_window; if (new_window) rollover_task_window(p, full_window); } if (flip_counters) { u64 curr_sum = *curr_runnable_sum; u64 nt_curr_sum = *nt_curr_runnable_sum; u8 curr_table = rq->curr_table; u8 prev_table = 1 - curr_table; int curr_top = rq->curr_top; clear_top_tasks_table(rq->top_tasks[prev_table]); clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]); if (prev_sum_reset) { curr_sum = nt_curr_sum = 0; curr_top = 0; clear_top_tasks_table(rq->top_tasks[curr_table]); clear_top_tasks_bitmap( rq->top_tasks_bitmap[curr_table]); } *prev_runnable_sum = curr_sum; *nt_prev_runnable_sum = nt_curr_sum; *curr_runnable_sum = 0; *nt_curr_runnable_sum = 0; rq->curr_table = prev_table; rq->prev_top = curr_top; rq->curr_top = 0; } if (!account_busy_for_cpu_time(rq, p, irqtime, event)) goto done; if (!new_window) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. No rollover * since we didn't start a new window. An example of this is * when a task starts execution and then sleeps within the * same window. */ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) delta = wallclock - mark_start; else delta = irqtime; delta = scale_exec_time(delta, rq); *curr_runnable_sum += delta; if (new_task) *nt_curr_runnable_sum += delta; if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.curr_window += delta; p->ravg.curr_window_cpu[cpu] += delta; } goto done; } if (!p_is_curr_task) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. A new window * has also started, but p is not the current task, so the * window is not rolled over - just split up and account * as necessary into curr and prev. The window is only * rolled over when a new window is processed for the current * task. * * Irqtime can't be accounted by a task that isn't the * currently running task. */ if (!full_window) { /* * A full window hasn't elapsed, account partial * contribution to previous completed window. */ delta = scale_exec_time(window_start - mark_start, rq); if (!exiting_task(p)) { p->ravg.prev_window += delta; p->ravg.prev_window_cpu[cpu] += delta; } } else { /* * Since at least one full window has elapsed, * the contribution to the previous window is the * full window (window_size). */ delta = scale_exec_time(window_size, rq); if (!exiting_task(p)) { p->ravg.prev_window = delta; p->ravg.prev_window_cpu[cpu] = delta; } } *prev_runnable_sum += delta; if (new_task) *nt_prev_runnable_sum += delta; /* Account piece of busy time in the current window. */ delta = scale_exec_time(wallclock - window_start, rq); *curr_runnable_sum += delta; if (new_task) *nt_curr_runnable_sum += delta; if (!exiting_task(p)) { p->ravg.curr_window = delta; p->ravg.curr_window_cpu[cpu] = delta; } goto done; } if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. A new window * has started and p is the current task so rollover is * needed. If any of these three above conditions are true * then this busy time can't be accounted as irqtime. * * Busy time for the idle task or exiting tasks need not * be accounted. * * An example of this would be a task that starts execution * and then sleeps once a new window has begun. 
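 *
 * A worked example of the split below (numbers are illustrative and
 * assume a 20 ms window): with mark_start at 15 ms, window_start at
 * 20 ms and wallclock at 23 ms,
 *
 *	p->ravg.prev_window += scale_exec_time(20 ms - 15 ms)
 *	p->ravg.curr_window  = scale_exec_time(23 ms - 20 ms)
 *
 * and the same two deltas are added to prev_runnable_sum and
 * curr_runnable_sum respectively.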
*/ if (!full_window) { /* * A full window hasn't elapsed, account partial * contribution to previous completed window. */ delta = scale_exec_time(window_start - mark_start, rq); if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.prev_window += delta; p->ravg.prev_window_cpu[cpu] += delta; } } else { /* * Since at least one full window has elapsed, * the contribution to the previous window is the * full window (window_size). */ delta = scale_exec_time(window_size, rq); if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.prev_window = delta; p->ravg.prev_window_cpu[cpu] = delta; } } /* * Rollover is done here by overwriting the values in * prev_runnable_sum and curr_runnable_sum. */ *prev_runnable_sum += delta; if (new_task) *nt_prev_runnable_sum += delta; /* Account piece of busy time in the current window. */ delta = scale_exec_time(wallclock - window_start, rq); *curr_runnable_sum += delta; if (new_task) *nt_curr_runnable_sum += delta; if (!is_idle_task(p) && !exiting_task(p)) { p->ravg.curr_window = delta; p->ravg.curr_window_cpu[cpu] = delta; } goto done; } if (irqtime) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. A new window * has started and p is the current task so rollover is * needed. The current task must be the idle task because * irqtime is not accounted for any other task. * * Irqtime will be accounted each time we process IRQ activity * after a period of idleness, so we know the IRQ busy time * started at wallclock - irqtime. */ BUG_ON(!is_idle_task(p)); mark_start = wallclock - irqtime; /* * Roll window over. If IRQ busy time was just in the current * window then that is all that need be accounted. */ if (mark_start > window_start) { *curr_runnable_sum = scale_exec_time(irqtime, rq); return; } /* * The IRQ busy time spanned multiple windows. Process the * busy time preceding the current window start first. */ delta = window_start - mark_start; if (delta > window_size) delta = window_size; delta = scale_exec_time(delta, rq); *prev_runnable_sum += delta; /* Process the remaining IRQ busy time in the current window. */ delta = wallclock - window_start; rq->curr_runnable_sum = scale_exec_time(delta, rq); return; } done: if (!is_idle_task(p) && !exiting_task(p)) update_top_tasks(p, rq, old_curr_window, new_window, full_window); } static inline u32 predict_and_update_buckets(struct rq *rq, struct task_struct *p, u32 runtime) { int bidx; u32 pred_demand; bidx = busy_to_bucket(runtime); pred_demand = get_pred_busy(rq, p, bidx, runtime); bucket_increase(p->ravg.busy_buckets, bidx); return pred_demand; } static void update_task_cpu_cycles(struct task_struct *p, int cpu) { if (use_cycle_counter) p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); } static void update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { u64 cur_cycles; int cpu = cpu_of(rq); lockdep_assert_held(&rq->lock); if (!use_cycle_counter) { rq->cc.cycles = cpu_cur_freq(cpu); rq->cc.time = 1; return; } cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); /* * If current task is idle task and irqtime == 0 CPU was * indeed idle and probably its cycle counter was not * increasing. We still need estimatied CPU frequency * for IO wait time accounting. Use the previously * calculated frequency in such a case. 
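 *
 * For reference, the units here work out so that the ratio
 * rq->cc.cycles / rq->cc.time behaves like a frequency in KHz,
 * matching the !use_cycle_counter fallback above where the ratio is
 * literally cpu_cur_freq(cpu) over 1. Illustrative numbers: a counter
 * that advanced by 2,000,000 cycles over a 1 ms busy period gives
 *
 *	rq->cc.cycles = 2000000 * NSEC_PER_MSEC
 *	rq->cc.time   = 1000000			(ns)
 *
 * i.e. 2,000,000 cycles per millisecond, a 2 GHz estimate.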
*/ if (!is_idle_task(rq->curr) || irqtime) { if (unlikely(cur_cycles < p->cpu_cycles)) rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); else rq->cc.cycles = cur_cycles - p->cpu_cycles; rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; if (event == IRQ_UPDATE && is_idle_task(p)) /* * Time between mark_start of idle task and IRQ handler * entry time is CPU cycle counter stall period. * Upon IRQ handler entry sched_account_irqstart() * replenishes idle task's cpu cycle counter so * rq->cc.cycles now represents increased cycles during * IRQ handler rather than time between idle entry and * IRQ exit. Thus use irqtime as time delta. */ rq->cc.time = irqtime; else rq->cc.time = wallclock - p->ravg.mark_start; BUG_ON((s64)rq->cc.time < 0); } p->cpu_cycles = cur_cycles; trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); } static int account_busy_for_task_demand(struct task_struct *p, int event) { /* * No need to bother updating task demand for exiting tasks * or the idle task. */ if (exiting_task(p) || is_idle_task(p)) return 0; /* * When a task is waking up it is completing a segment of non-busy * time. Likewise, if wait time is not treated as busy time, then * when a task begins to run or is migrated, it is not running and * is completing a segment of non-busy time. */ if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) return 0; return 1; } /* * Called when new window is starting for a task, to record cpu usage over * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 * when, say, a real-time task runs without preemption for several windows at a * stretch. */ static void update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples, int event) { u32 *hist = &p->ravg.sum_history[0]; int ridx, widx; u32 max = 0, avg, demand, pred_demand; u64 sum = 0; /* Ignore windows where task had no activity */ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) goto done; /* Push new 'runtime' value onto stack */ widx = sched_ravg_hist_size - 1; ridx = widx - samples; for (; ridx >= 0; --widx, --ridx) { hist[widx] = hist[ridx]; sum += hist[widx]; if (hist[widx] > max) max = hist[widx]; } for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { hist[widx] = runtime; sum += hist[widx]; if (hist[widx] > max) max = hist[widx]; } p->ravg.sum = 0; if (sched_window_stats_policy == WINDOW_STATS_RECENT) { demand = runtime; } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { demand = max; } else { avg = div64_u64(sum, sched_ravg_hist_size); if (sched_window_stats_policy == WINDOW_STATS_AVG) demand = avg; else demand = max(avg, runtime); } pred_demand = predict_and_update_buckets(rq, p, runtime); /* * A throttled deadline sched class task gets dequeued without * changing p->on_rq. Since the dequeue decrements hmp stats * avoid decrementing it here again. 
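 *
 * (Aside on the demand value chosen above - the numbers below are
 * purely illustrative. With sched_ravg_hist_size = 5 and a history
 * of {10, 8, 6, 4, 2} ms after the new 10 ms runtime is pushed:
 *
 *	WINDOW_STATS_RECENT	-> 10 ms  (runtime)
 *	WINDOW_STATS_MAX	-> 10 ms  (max of the history)
 *	WINDOW_STATS_AVG	->  6 ms  (30 ms / 5)
 *	otherwise		-> 10 ms  (max(avg, runtime))
 *
 * The demand then feeds fixup_hmp_sched_stats() below, subject to the
 * deadline-throttling exception this comment describes.)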
*/ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || !p->dl.dl_throttled)) p->sched_class->fixup_hmp_sched_stats(rq, p, demand, pred_demand); p->ravg.demand = demand; p->ravg.pred_demand = pred_demand; done: trace_sched_update_history(rq, p, runtime, samples, event); } static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) { delta = scale_exec_time(delta, rq); p->ravg.sum += delta; if (unlikely(p->ravg.sum > sched_ravg_window)) p->ravg.sum = sched_ravg_window; } /* * Account cpu demand of task and/or update task's cpu demand history * * ms = p->ravg.mark_start; * wc = wallclock * ws = rq->window_start * * Three possibilities: * * a) Task event is contained within one window. * window_start < mark_start < wallclock * * ws ms wc * | | | * V V V * |---------------| * * In this case, p->ravg.sum is updated *iff* event is appropriate * (ex: event == PUT_PREV_TASK) * * b) Task event spans two windows. * mark_start < window_start < wallclock * * ms ws wc * | | | * V V V * -----|------------------- * * In this case, p->ravg.sum is updated with (ws - ms) *iff* event * is appropriate, then a new window sample is recorded followed * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. * * c) Task event spans more than two windows. * * ms ws_tmp ws wc * | | | | * V V V V * ---|-------|-------|-------|-------|------ * | | * |<------ nr_full_windows ------>| * * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* * event is appropriate, window sample of p->ravg.sum is recorded, * 'nr_full_window' samples of window_size is also recorded *iff* * event is appropriate and finally p->ravg.sum is set to (wc - ws) * *iff* event is appropriate. * * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() * depends on it! */ static void update_task_demand(struct task_struct *p, struct rq *rq, int event, u64 wallclock) { u64 mark_start = p->ravg.mark_start; u64 delta, window_start = rq->window_start; int new_window, nr_full_windows; u32 window_size = sched_ravg_window; new_window = mark_start < window_start; if (!account_busy_for_task_demand(p, event)) { if (new_window) /* * If the time accounted isn't being accounted as * busy time, and a new window started, only the * previous window need be closed out with the * pre-existing demand. Multiple windows may have * elapsed, but since empty windows are dropped, * it is not necessary to account those. */ update_history(rq, p, p->ravg.sum, 1, event); return; } if (!new_window) { /* * The simple case - busy time contained within the existing * window. */ add_to_task_demand(rq, p, wallclock - mark_start); return; } /* * Busy time spans at least two windows. Temporarily rewind * window_start to first window boundary after mark_start. */ delta = window_start - mark_start; nr_full_windows = div64_u64(delta, window_size); window_start -= (u64)nr_full_windows * (u64)window_size; /* Process (window_start - mark_start) first */ add_to_task_demand(rq, p, window_start - mark_start); /* Push new sample(s) into task's demand history */ update_history(rq, p, p->ravg.sum, 1, event); if (nr_full_windows) update_history(rq, p, scale_exec_time(window_size, rq), nr_full_windows, event); /* * Roll window_start back to current to process any remainder * in current window. 
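 *
 * Putting numbers on case (c) from the comment above (illustrative,
 * 20 ms window): with mark_start = 5 ms and wallclock = 73 ms the
 * current window_start is 60 ms, so delta = 55 ms and
 * nr_full_windows = 2. window_start is temporarily rewound to 20 ms;
 * we account 15 ms of demand, record that sample, record two full
 * 20 ms samples, and finally account the trailing 13 ms against the
 * current window after restoring window_start to 60 ms below.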
*/ window_start += (u64)nr_full_windows * (u64)window_size; /* Process (wallclock - window_start) next */ mark_start = window_start; add_to_task_demand(rq, p, wallclock - mark_start); } /* Reflect task activity on its demand and cpu's busy time statistics */ void update_task_ravg(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { if (!rq->window_start || sched_disable_window_stats || p->ravg.mark_start == wallclock) return; lockdep_assert_held(&rq->lock); update_window_start(rq, wallclock); if (!p->ravg.mark_start) { update_task_cpu_cycles(p, cpu_of(rq)); goto done; } update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); update_task_demand(p, rq, event, wallclock); update_cpu_busy_time(p, rq, event, wallclock, irqtime); update_task_pred_demand(rq, p, event); done: trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, rq->cc.cycles, rq->cc.time, _group_cpu_time(p->grp, cpu_of(rq))); p->ravg.mark_start = wallclock; } void sched_account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock) { struct rq *rq = cpu_rq(cpu); unsigned long flags, nr_windows; u64 cur_jiffies_ts; raw_spin_lock_irqsave(&rq->lock, flags); /* * cputime (wallclock) uses sched_clock so use the same here for * consistency. */ delta += sched_clock() - wallclock; cur_jiffies_ts = get_jiffies_64(); if (is_idle_task(curr)) update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), delta); nr_windows = cur_jiffies_ts - rq->irqload_ts; if (nr_windows) { if (nr_windows < 10) { /* Decay CPU's irqload by 3/4 for each window. */ rq->avg_irqload *= (3 * nr_windows); rq->avg_irqload = div64_u64(rq->avg_irqload, 4 * nr_windows); } else { rq->avg_irqload = 0; } rq->avg_irqload += rq->cur_irqload; rq->cur_irqload = 0; } rq->cur_irqload += delta; rq->irqload_ts = cur_jiffies_ts; raw_spin_unlock_irqrestore(&rq->lock, flags); } void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) { struct rq *rq = cpu_rq(cpu); if (!rq->window_start || sched_disable_window_stats) return; if (is_idle_task(curr)) { /* We're here without rq->lock held, IRQ disabled */ raw_spin_lock(&rq->lock); update_task_cpu_cycles(curr, cpu); raw_spin_unlock(&rq->lock); } } void reset_task_stats(struct task_struct *p) { u32 sum = 0; u32 *curr_window_ptr = NULL; u32 *prev_window_ptr = NULL; if (exiting_task(p)) { sum = EXITING_TASK_MARKER; } else { curr_window_ptr = p->ravg.curr_window_cpu; prev_window_ptr = p->ravg.prev_window_cpu; memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); } memset(&p->ravg, 0, sizeof(struct ravg)); p->ravg.curr_window_cpu = curr_window_ptr; p->ravg.prev_window_cpu = prev_window_ptr; /* Retain EXITING_TASK marker */ p->ravg.sum_history[0] = sum; } void mark_task_starting(struct task_struct *p) { u64 wallclock; struct rq *rq = task_rq(p); if (!rq->window_start || sched_disable_window_stats) { reset_task_stats(p); return; } wallclock = sched_ktime_clock(); p->ravg.mark_start = p->last_wake_ts = wallclock; p->last_cpu_selected_ts = wallclock; p->last_switch_out_ts = 0; update_task_cpu_cycles(p, cpu_of(rq)); } void set_window_start(struct rq *rq) { int cpu = cpu_of(rq); struct rq *sync_rq = cpu_rq(sync_cpu); if (rq->window_start || !sched_enable_hmp) return; if (cpu == sync_cpu) { rq->window_start = sched_ktime_clock(); } else { raw_spin_unlock(&rq->lock); double_rq_lock(rq, sync_rq); rq->window_start = cpu_rq(sync_cpu)->window_start; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = 
rq->nt_prev_runnable_sum = 0; raw_spin_unlock(&sync_rq->lock); } rq->curr->ravg.mark_start = rq->window_start; } void migrate_sync_cpu(int cpu, int new_cpu) { if (cpu == sync_cpu) sync_cpu = new_cpu; } static void reset_all_task_stats(void) { struct task_struct *g, *p; do_each_thread(g, p) { reset_task_stats(p); } while_each_thread(g, p); } enum reset_reason_code { WINDOW_CHANGE, POLICY_CHANGE, HIST_SIZE_CHANGE, FREQ_AGGREGATE_CHANGE, }; const char *sched_window_reset_reasons[] = { "WINDOW_CHANGE", "POLICY_CHANGE", "HIST_SIZE_CHANGE", }; /* Called with IRQs enabled */ void reset_all_window_stats(u64 window_start, unsigned int window_size) { int cpu, i; unsigned long flags; u64 start_ts = sched_ktime_clock(); int reason = WINDOW_CHANGE; unsigned int old = 0, new = 0; struct related_thread_group *grp; local_irq_save(flags); read_lock(&tasklist_lock); read_lock(&related_thread_group_lock); /* Taking all runqueue locks prevents race with sched_exit(). */ for_each_possible_cpu(cpu) raw_spin_lock(&cpu_rq(cpu)->lock); sched_disable_window_stats = 1; reset_all_task_stats(); read_unlock(&tasklist_lock); list_for_each_entry(grp, &related_thread_groups, list) { int j; for_each_possible_cpu(j) { struct group_cpu_time *cpu_time; /* Protected by rq lock */ cpu_time = _group_cpu_time(grp, j); memset(cpu_time, 0, sizeof(struct group_cpu_time)); if (window_start) cpu_time->window_start = window_start; } } if (window_size) { sched_ravg_window = window_size * TICK_NSEC; set_hmp_defaults(); sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES; } sched_disable_window_stats = 0; for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); if (window_start) rq->window_start = window_start; rq->curr_runnable_sum = rq->prev_runnable_sum = 0; rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { memset(&rq->load_subs[i], 0, sizeof(struct load_subtractions)); clear_top_tasks_table(rq->top_tasks[i]); clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]); } rq->curr_table = 0; rq->curr_top = 0; rq->prev_top = 0; reset_cpu_hmp_stats(cpu, 1); } if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { reason = POLICY_CHANGE; old = sched_window_stats_policy; new = sysctl_sched_window_stats_policy; sched_window_stats_policy = sysctl_sched_window_stats_policy; } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { reason = HIST_SIZE_CHANGE; old = sched_ravg_hist_size; new = sysctl_sched_ravg_hist_size; sched_ravg_hist_size = sysctl_sched_ravg_hist_size; } else if (sched_freq_aggregate != sysctl_sched_freq_aggregate) { reason = FREQ_AGGREGATE_CHANGE; old = sched_freq_aggregate; new = sysctl_sched_freq_aggregate; sched_freq_aggregate = sysctl_sched_freq_aggregate; } for_each_possible_cpu(cpu) raw_spin_unlock(&cpu_rq(cpu)->lock); read_unlock(&related_thread_group_lock); local_irq_restore(flags); trace_sched_reset_all_window_stats(window_start, window_size, sched_ktime_clock() - start_ts, reason, old, new); } /* * In this function we match the accumulated subtractions with the current * and previous windows we are operating with. Ignore any entries where * the window start in the load_subtraction struct does not match either * the curent or the previous window. This could happen whenever CPUs * become idle or busy with interrupts disabled for an extended period. 
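 *
 * For example (illustrative values): with rq->window_start = 100 ms
 * and a 20 ms window, only entries stamped 100 ms (current) or 80 ms
 * (previous) are subtracted from the respective runnable sums; an
 * entry stamped 60 ms has aged out of both and is simply zeroed along
 * with the rest.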
*/ static inline void account_load_subtractions(struct rq *rq) { u64 ws = rq->window_start; u64 prev_ws = ws - sched_ravg_window; struct load_subtractions *ls = rq->load_subs; int i; for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { if (ls[i].window_start == ws) { rq->curr_runnable_sum -= ls[i].subs; rq->nt_curr_runnable_sum -= ls[i].new_subs; } else if (ls[i].window_start == prev_ws) { rq->prev_runnable_sum -= ls[i].subs; rq->nt_prev_runnable_sum -= ls[i].new_subs; } ls[i].subs = 0; ls[i].new_subs = 0; } BUG_ON((s64)rq->prev_runnable_sum < 0); BUG_ON((s64)rq->curr_runnable_sum < 0); BUG_ON((s64)rq->nt_prev_runnable_sum < 0); BUG_ON((s64)rq->nt_curr_runnable_sum < 0); } static inline u64 freq_policy_load(struct rq *rq, u64 load) { unsigned int reporting_policy = sysctl_sched_freq_reporting_policy; switch (reporting_policy) { case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK: load = max_t(u64, load, top_task_load(rq)); break; case FREQ_REPORT_TOP_TASK: load = top_task_load(rq); break; case FREQ_REPORT_CPU_LOAD: break; default: WARN_ON_ONCE(1); } return load; } static inline void sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); void sched_get_cpus_busy(struct sched_load *busy, const struct cpumask *query_cpus) { unsigned long flags; struct rq *rq; const int cpus = cpumask_weight(query_cpus); u64 load[cpus], group_load[cpus]; u64 nload[cpus], ngload[cpus]; u64 pload[cpus]; unsigned int max_freq[cpus]; int notifier_sent = 0; int early_detection[cpus]; int cpu, i = 0; unsigned int window_size; u64 max_prev_sum = 0; int max_busy_cpu = cpumask_first(query_cpus); struct related_thread_group *grp; u64 total_group_load = 0, total_ngload = 0; bool aggregate_load = false; struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus)); if (unlikely(cpus == 0)) return; local_irq_save(flags); read_lock(&related_thread_group_lock); /* * This function could be called in timer context, and the * current task may have been executing for a long time. Ensure * that the window stats are current by doing an update. */ for_each_cpu(cpu, query_cpus) raw_spin_lock(&cpu_rq(cpu)->lock); window_size = sched_ravg_window; /* * We don't really need the cluster lock for this entire for loop * block. However, there is no advantage in optimizing this as rq * locks are held regardless and would prevent migration anyways */ raw_spin_lock(&cluster->load_lock); for_each_cpu(cpu, query_cpus) { rq = cpu_rq(cpu); update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0); account_load_subtractions(rq); load[i] = rq->prev_runnable_sum; nload[i] = rq->nt_prev_runnable_sum; pload[i] = rq->hmp_stats.pred_demands_sum; rq->old_estimated_time = pload[i]; if (load[i] > max_prev_sum) { max_prev_sum = load[i]; max_busy_cpu = cpu; } /* * sched_get_cpus_busy() is called for all CPUs in a * frequency domain. So the notifier_sent flag per * cluster works even when a frequency domain spans * more than 1 cluster. 
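 *
 * (For orientation, each entry assembled here for query_cpus roughly
 * ends up as - a sketch of the steps below:
 *
 *	busy[i].prev_load      ~ scaled(prev_runnable_sum + group load)
 *	busy[i].new_task_load  ~ scaled(nt_prev_runnable_sum + group nt load)
 *	busy[i].predicted_load ~ scaled(pred_demands_sum)
 *
 * all converted to microseconds, where "scaled" covers
 * freq_policy_load() (for prev_load), scale_load_to_cpu() and
 * scale_load_to_freq(). CPUs with an early-detection task instead
 * report a full window as prev_load and zero new-task load.)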
*/ if (rq->cluster->notifier_sent) { notifier_sent = 1; rq->cluster->notifier_sent = 0; } early_detection[i] = (rq->ed_task != NULL); max_freq[i] = cpu_max_freq(cpu); i++; } raw_spin_unlock(&cluster->load_lock); for_each_related_thread_group(grp) { for_each_cpu(cpu, query_cpus) { /* Protected by rq_lock */ struct group_cpu_time *cpu_time = _group_cpu_time(grp, cpu); sync_window_start(cpu_rq(cpu), cpu_time); } } group_load_in_freq_domain( &cpu_rq(max_busy_cpu)->freq_domain_cpumask, &total_group_load, &total_ngload); aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold); i = 0; for_each_cpu(cpu, query_cpus) { group_load[i] = 0; ngload[i] = 0; if (early_detection[i]) goto skip_early; rq = cpu_rq(cpu); if (aggregate_load) { if (cpu == max_busy_cpu) { group_load[i] = total_group_load; ngload[i] = total_ngload; } } else { _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); } load[i] += group_load[i]; nload[i] += ngload[i]; load[i] = freq_policy_load(rq, load[i]); rq->old_busy_time = load[i]; /* * Scale load in reference to cluster max_possible_freq. * * Note that scale_load_to_cpu() scales load in reference to * the cluster max_freq. */ load[i] = scale_load_to_cpu(load[i], cpu); nload[i] = scale_load_to_cpu(nload[i], cpu); pload[i] = scale_load_to_cpu(pload[i], cpu); skip_early: i++; } for_each_cpu(cpu, query_cpus) raw_spin_unlock(&(cpu_rq(cpu))->lock); read_unlock(&related_thread_group_lock); local_irq_restore(flags); i = 0; for_each_cpu(cpu, query_cpus) { rq = cpu_rq(cpu); if (early_detection[i]) { busy[i].prev_load = div64_u64(sched_ravg_window, NSEC_PER_USEC); busy[i].new_task_load = 0; goto exit_early; } load[i] = scale_load_to_freq(load[i], max_freq[i], cpu_max_possible_freq(cpu)); nload[i] = scale_load_to_freq(nload[i], max_freq[i], cpu_max_possible_freq(cpu)); pload[i] = scale_load_to_freq(pload[i], max_freq[i], rq->cluster->max_possible_freq); busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); exit_early: trace_sched_get_busy(cpu, busy[i].prev_load, busy[i].new_task_load, busy[i].predicted_load, early_detection[i]); i++; } } void sched_set_io_is_busy(int val) { sched_io_is_busy = val; } int sched_set_window(u64 window_start, unsigned int window_size) { u64 now, cur_jiffies, jiffy_ktime_ns; s64 ws; unsigned long flags; if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW) return -EINVAL; mutex_lock(&policy_mutex); /* * Get a consistent view of ktime, jiffies, and the time * since the last jiffy (based on last_jiffies_update). */ local_irq_save(flags); cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); local_irq_restore(flags); /* translate window_start from jiffies to nanoseconds */ ws = (window_start - cur_jiffies); /* jiffy difference */ ws *= TICK_NSEC; ws += jiffy_ktime_ns; /* * Roll back calculated window start so that it is in * the past (window stats must have a current window). 
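 *
 * For example (illustrative): if the translated ws comes out 7 ms in
 * the future relative to 'now' and the window is 20 ms, one pass of
 * the loop below moves ws 13 ms into the past, which is what
 * reset_all_window_stats() requires.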
*/ while (ws > now) ws -= (window_size * TICK_NSEC); BUG_ON(sched_ktime_clock() < ws); reset_all_window_stats(ws, window_size); sched_update_freq_max_load(cpu_possible_mask); mutex_unlock(&policy_mutex); return 0; } static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) { rq->load_subs[index].window_start = ws; rq->load_subs[index].subs = 0; rq->load_subs[index].new_subs = 0; } static bool get_subtraction_index(struct rq *rq, u64 ws) { int i; u64 oldest = ULLONG_MAX; int oldest_index = 0; for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { u64 entry_ws = rq->load_subs[i].window_start; if (ws == entry_ws) return i; if (entry_ws < oldest) { oldest = entry_ws; oldest_index = i; } } create_subtraction_entry(rq, ws, oldest_index); return oldest_index; } static void update_rq_load_subtractions(int index, struct rq *rq, u32 sub_load, bool new_task) { rq->load_subs[index].subs += sub_load; if (new_task) rq->load_subs[index].new_subs += sub_load; } static void update_cluster_load_subtractions(struct task_struct *p, int cpu, u64 ws, bool new_task) { struct sched_cluster *cluster = cpu_cluster(cpu); struct cpumask cluster_cpus = cluster->cpus; u64 prev_ws = ws - sched_ravg_window; int i; cpumask_clear_cpu(cpu, &cluster_cpus); raw_spin_lock(&cluster->load_lock); for_each_cpu(i, &cluster_cpus) { struct rq *rq = cpu_rq(i); int index; if (p->ravg.curr_window_cpu[i]) { index = get_subtraction_index(rq, ws); update_rq_load_subtractions(index, rq, p->ravg.curr_window_cpu[i], new_task); p->ravg.curr_window_cpu[i] = 0; } if (p->ravg.prev_window_cpu[i]) { index = get_subtraction_index(rq, prev_ws); update_rq_load_subtractions(index, rq, p->ravg.prev_window_cpu[i], new_task); p->ravg.prev_window_cpu[i] = 0; } } raw_spin_unlock(&cluster->load_lock); } static inline void inter_cluster_migration_fixup (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) { struct rq *dest_rq = cpu_rq(new_cpu); struct rq *src_rq = cpu_rq(task_cpu); if (same_freq_domain(new_cpu, task_cpu)) return; p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; dest_rq->curr_runnable_sum += p->ravg.curr_window; dest_rq->prev_runnable_sum += p->ravg.prev_window; src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; if (new_task) { dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; src_rq->nt_curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; src_rq->nt_prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; } p->ravg.curr_window_cpu[task_cpu] = 0; p->ravg.prev_window_cpu[task_cpu] = 0; update_cluster_load_subtractions(p, task_cpu, src_rq->window_start, new_task); BUG_ON((s64)src_rq->prev_runnable_sum < 0); BUG_ON((s64)src_rq->curr_runnable_sum < 0); BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); } static int get_top_index(unsigned long *bitmap, unsigned long old_top) { int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top); if (index == NUM_LOAD_INDICES) return 0; return NUM_LOAD_INDICES - 1 - index; } static void migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) { int index; int top_index; u32 curr_window = p->ravg.curr_window; u32 prev_window = p->ravg.prev_window; u8 src = src_rq->curr_table; u8 dst = dst_rq->curr_table; u8 *src_table; u8 *dst_table; if (curr_window) { src_table = src_rq->top_tasks[src]; dst_table = dst_rq->top_tasks[dst]; 
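		/*
		 * Illustrative walk-through of the moves below (index value
		 * made up): if the migrating task's curr_window falls in
		 * bucket 7, we decrement src_table[7] and increment
		 * dst_table[7], fix up the corresponding bitmap bits (stored
		 * MSB-first, hence the NUM_LOAD_INDICES - index - 1
		 * positions), raise dst_rq->curr_top if needed, and rescan
		 * src_rq's bitmap for a new top index when the task was the
		 * old top and its bucket emptied.
		 */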
index = load_to_index(curr_window); src_table[index] -= 1; dst_table[index] += 1; if (!src_table[index]) __clear_bit(NUM_LOAD_INDICES - index - 1, src_rq->top_tasks_bitmap[src]); if (dst_table[index] == 1) __set_bit(NUM_LOAD_INDICES - index - 1, dst_rq->top_tasks_bitmap[dst]); if (index > dst_rq->curr_top) dst_rq->curr_top = index; top_index = src_rq->curr_top; if (index == top_index && !src_table[index]) src_rq->curr_top = get_top_index( src_rq->top_tasks_bitmap[src], top_index); } if (prev_window) { src = 1 - src; dst = 1 - dst; src_table = src_rq->top_tasks[src]; dst_table = dst_rq->top_tasks[dst]; index = load_to_index(prev_window); src_table[index] -= 1; dst_table[index] += 1; if (!src_table[index]) __clear_bit(NUM_LOAD_INDICES - index - 1, src_rq->top_tasks_bitmap[src]); if (dst_table[index] == 1) __set_bit(NUM_LOAD_INDICES - index - 1, dst_rq->top_tasks_bitmap[dst]); if (index > dst_rq->prev_top) dst_rq->prev_top = index; top_index = src_rq->prev_top; if (index == top_index && !src_table[index]) src_rq->prev_top = get_top_index( src_rq->top_tasks_bitmap[src], top_index); } } void fixup_busy_time(struct task_struct *p, int new_cpu) { struct rq *src_rq = task_rq(p); struct rq *dest_rq = cpu_rq(new_cpu); u64 wallclock; u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; bool new_task; struct related_thread_group *grp; if (!sched_enable_hmp || (!p->on_rq && p->state != TASK_WAKING)) return; if (exiting_task(p)) { clear_ed_task(p, src_rq); return; } if (p->state == TASK_WAKING) double_rq_lock(src_rq, dest_rq); if (sched_disable_window_stats) goto done; wallclock = sched_ktime_clock(); update_task_ravg(task_rq(p)->curr, task_rq(p), TASK_UPDATE, wallclock, 0); update_task_ravg(dest_rq->curr, dest_rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); update_task_cpu_cycles(p, new_cpu); new_task = is_new_task(p); /* Protected by rq_lock */ grp = p->grp; /* * For frequency aggregation, we continue to do migration fixups * even for intra cluster migrations. This is because, the aggregated * load has to reported on a single CPU regardless. 
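 *
 * Concretely (illustrative numbers): a grouped task with a 4 ms
 * curr_window migrating from CPU0 to CPU1 has those 4 ms moved from
 * the group's per-cpu counters for CPU0 to the ones for CPU1 below,
 * even when both CPUs share a frequency domain; an ungrouped task in
 * the same situation would skip the fixup entirely, see
 * inter_cluster_migration_fixup().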
*/ if (grp && sched_freq_aggregate) { struct group_cpu_time *cpu_time; cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); src_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; sync_window_start(dest_rq, cpu_time); if (p->ravg.curr_window) { *src_curr_runnable_sum -= p->ravg.curr_window; *dst_curr_runnable_sum += p->ravg.curr_window; if (new_task) { *src_nt_curr_runnable_sum -= p->ravg.curr_window; *dst_nt_curr_runnable_sum += p->ravg.curr_window; } } if (p->ravg.prev_window) { *src_prev_runnable_sum -= p->ravg.prev_window; *dst_prev_runnable_sum += p->ravg.prev_window; if (new_task) { *src_nt_prev_runnable_sum -= p->ravg.prev_window; *dst_nt_prev_runnable_sum += p->ravg.prev_window; } } } else { inter_cluster_migration_fixup(p, new_cpu, task_cpu(p), new_task); } migrate_top_tasks(p, src_rq, dest_rq); if (p == src_rq->ed_task) { src_rq->ed_task = NULL; if (!dest_rq->ed_task) dest_rq->ed_task = p; } done: if (p->state == TASK_WAKING) double_rq_unlock(src_rq, dest_rq); } #define sched_up_down_migrate_auto_update 1 static void check_for_up_down_migrate_update(const struct cpumask *cpus) { int i = cpumask_first(cpus); if (!sched_up_down_migrate_auto_update) return; if (cpu_max_possible_capacity(i) == max_possible_capacity) return; if (cpu_max_possible_freq(i) == cpu_max_freq(i)) up_down_migrate_scale_factor = 1024; else up_down_migrate_scale_factor = (1024 * cpu_max_possible_freq(i)) / cpu_max_freq(i); update_up_down_migrate(); } /* Return cluster which can offer required capacity for group */ static struct sched_cluster *best_cluster(struct related_thread_group *grp, u64 total_demand, bool group_boost) { struct sched_cluster *cluster = NULL; for_each_sched_cluster(cluster) { if (group_will_fit(cluster, grp, total_demand, group_boost)) return cluster; } return NULL; } static void _set_preferred_cluster(struct related_thread_group *grp) { struct task_struct *p; u64 combined_demand = 0; bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG; bool group_boost = false; u64 wallclock; if (!sysctl_sched_enable_colocation) { grp->last_update = sched_ktime_clock(); grp->preferred_cluster = NULL; return; } if (list_empty(&grp->tasks)) return; wallclock = sched_ktime_clock(); /* * wakeup of two or more related tasks could race with each other and * could result in multiple calls to _set_preferred_cluster being issued * at same time. 
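 *
 * The window/10 check below rate-limits this: with a 20 ms window,
 * for instance, evaluations less than 2 ms apart simply reuse the
 * previously chosen preferred cluster.
 *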
Avoid overhead in such cases of rechecking preferred * cluster */ if (wallclock - grp->last_update < sched_ravg_window / 10) return; list_for_each_entry(p, &grp->tasks, grp_list) { if (boost_on_big && task_sched_boost(p)) { group_boost = true; break; } if (p->ravg.mark_start < wallclock - (sched_ravg_window * sched_ravg_hist_size)) continue; combined_demand += p->ravg.demand; } grp->preferred_cluster = best_cluster(grp, combined_demand, group_boost); grp->last_update = sched_ktime_clock(); trace_sched_set_preferred_cluster(grp, combined_demand); } void set_preferred_cluster(struct related_thread_group *grp) { raw_spin_lock(&grp->lock); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); } #define ADD_TASK 0 #define REM_TASK 1 #define DEFAULT_CGROUP_COLOC_ID 1 static inline void free_group_cputime(struct related_thread_group *grp) { free_percpu(grp->cpu_time); } static int alloc_group_cputime(struct related_thread_group *grp) { int i; struct group_cpu_time *cpu_time; int cpu = raw_smp_processor_id(); struct rq *rq = cpu_rq(cpu); u64 window_start = rq->window_start; grp->cpu_time = alloc_percpu_gfp(struct group_cpu_time, GFP_ATOMIC); if (!grp->cpu_time) return -ENOMEM; for_each_possible_cpu(i) { cpu_time = per_cpu_ptr(grp->cpu_time, i); memset(cpu_time, 0, sizeof(struct group_cpu_time)); cpu_time->window_start = window_start; } return 0; } /* * A group's window_start may be behind. When moving it forward, flip prev/curr * counters. When moving forward > 1 window, prev counter is set to 0 */ static inline void sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) { u64 delta; int nr_windows; u64 curr_sum = cpu_time->curr_runnable_sum; u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; delta = rq->window_start - cpu_time->window_start; if (!delta) return; nr_windows = div64_u64(delta, sched_ravg_window); if (nr_windows > 1) curr_sum = nt_curr_sum = 0; cpu_time->prev_runnable_sum = curr_sum; cpu_time->curr_runnable_sum = 0; cpu_time->nt_prev_runnable_sum = nt_curr_sum; cpu_time->nt_curr_runnable_sum = 0; cpu_time->window_start = rq->window_start; } /* * Task's cpu usage is accounted in: * rq->curr/prev_runnable_sum, when its ->grp is NULL * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL * * Transfer task's cpu usage between those counters when transitioning between * groups */ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, struct task_struct *p, int event) { u64 wallclock; struct group_cpu_time *cpu_time; u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; struct migration_sum_data d; int migrate_type; int cpu = cpu_of(rq); bool new_task = is_new_task(p); int i; if (!sched_freq_aggregate) return; wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); /* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */ cpu_time = _group_cpu_time(grp, cpu); if (event == ADD_TASK) { sync_window_start(rq, cpu_time); migrate_type = RQ_TO_GROUP; d.src_rq = rq; d.src_cpu_time = NULL; d.dst_rq = NULL; d.dst_cpu_time = cpu_time; src_curr_runnable_sum = &rq->curr_runnable_sum; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &rq->prev_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = 
&rq->nt_curr_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; if (new_task) { *src_nt_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; *src_nt_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; } update_cluster_load_subtractions(p, cpu, rq->window_start, new_task); } else { migrate_type = GROUP_TO_RQ; d.src_rq = NULL; d.src_cpu_time = cpu_time; d.dst_rq = rq; d.dst_cpu_time = NULL; /* * In case of REM_TASK, cpu_time->window_start would be * uptodate, because of the update_task_ravg() we called * above on the moving task. Hence no need for * sync_window_start() */ src_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_curr_runnable_sum = &rq->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_prev_runnable_sum = &rq->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; *src_curr_runnable_sum -= p->ravg.curr_window; *src_prev_runnable_sum -= p->ravg.prev_window; if (new_task) { *src_nt_curr_runnable_sum -= p->ravg.curr_window; *src_nt_prev_runnable_sum -= p->ravg.prev_window; } /* * Need to reset curr/prev windows for all CPUs, not just the * ones in the same cluster. Since inter cluster migrations * did not result in the appropriate book keeping, the values * per CPU would be inaccurate. */ for_each_possible_cpu(i) { p->ravg.curr_window_cpu[i] = 0; p->ravg.prev_window_cpu[i] = 0; } } *dst_curr_runnable_sum += p->ravg.curr_window; *dst_prev_runnable_sum += p->ravg.prev_window; if (new_task) { *dst_nt_curr_runnable_sum += p->ravg.curr_window; *dst_nt_prev_runnable_sum += p->ravg.prev_window; } /* * When a task enter or exits a group, it's curr and prev windows are * moved to a single CPU. This behavior might be sub-optimal in the * exit case, however, it saves us the overhead of handling inter * cluster migration fixups while the task is part of a related group. */ p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; trace_sched_migration_update_sum(p, migrate_type, &d); BUG_ON((s64)*src_curr_runnable_sum < 0); BUG_ON((s64)*src_prev_runnable_sum < 0); } static inline struct group_cpu_time * task_group_cpu_time(struct task_struct *p, int cpu) { return _group_cpu_time(rcu_dereference(p->grp), cpu); } static inline struct group_cpu_time * _group_cpu_time(struct related_thread_group *grp, int cpu) { return grp ? 
per_cpu_ptr(grp->cpu_time, cpu) : NULL; } struct related_thread_group *alloc_related_thread_group(int group_id) { struct related_thread_group *grp; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return ERR_PTR(-ENOMEM); if (alloc_group_cputime(grp)) { kfree(grp); return ERR_PTR(-ENOMEM); } grp->id = group_id; INIT_LIST_HEAD(&grp->tasks); INIT_LIST_HEAD(&grp->list); raw_spin_lock_init(&grp->lock); return grp; } struct related_thread_group *lookup_related_thread_group(unsigned int group_id) { struct related_thread_group *grp; list_for_each_entry(grp, &related_thread_groups, list) { if (grp->id == group_id) return grp; } return NULL; } /* See comments before preferred_cluster() */ static void free_related_thread_group(struct rcu_head *rcu) { struct related_thread_group *grp = container_of(rcu, struct related_thread_group, rcu); free_group_cputime(grp); kfree(grp); } static void remove_task_from_group(struct task_struct *p) { struct related_thread_group *grp = p->grp; struct rq *rq; int empty_group = 1; raw_spin_lock(&grp->lock); rq = __task_rq_lock(p); transfer_busy_time(rq, p->grp, p, REM_TASK); list_del_init(&p->grp_list); rcu_assign_pointer(p->grp, NULL); __task_rq_unlock(rq); if (!list_empty(&grp->tasks)) { empty_group = 0; _set_preferred_cluster(grp); } raw_spin_unlock(&grp->lock); /* Reserved groups cannot be destroyed */ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) { list_del(&grp->list); call_rcu(&grp->rcu, free_related_thread_group); } } static int add_task_to_group(struct task_struct *p, struct related_thread_group *grp) { struct rq *rq; raw_spin_lock(&grp->lock); /* * Change p->grp under rq->lock. Will prevent races with read-side * reference of p->grp in various hot-paths */ rq = __task_rq_lock(p); transfer_busy_time(rq, grp, p, ADD_TASK); list_add(&p->grp_list, &grp->tasks); rcu_assign_pointer(p->grp, grp); __task_rq_unlock(rq); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); return 0; } void add_new_task_to_grp(struct task_struct *new) { unsigned long flags; struct related_thread_group *grp; struct task_struct *leader = new->group_leader; unsigned int leader_grp_id = sched_get_group_id(leader); if (!sysctl_sched_enable_thread_grouping && leader_grp_id != DEFAULT_CGROUP_COLOC_ID) return; if (thread_group_leader(new)) return; if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) { if (!same_schedtune(new, leader)) return; } write_lock_irqsave(&related_thread_group_lock, flags); rcu_read_lock(); grp = task_related_thread_group(leader); rcu_read_unlock(); /* * It's possible that someone already added the new task to the * group. A leader's thread group is updated prior to calling * this function. It's also possible that the leader has exited * the group. In either case, there is nothing else to do. */ if (!grp || new->grp) { write_unlock_irqrestore(&related_thread_group_lock, flags); return; } raw_spin_lock(&grp->lock); rcu_assign_pointer(new->grp, grp); list_add(&new->grp_list, &grp->tasks); raw_spin_unlock(&grp->lock); write_unlock_irqrestore(&related_thread_group_lock, flags); } #if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) /* * We create a default colocation group at boot. There is no need to * synchronize tasks between cgroups at creation time because the * correct cgroup hierarchy is not available at boot. Therefore cgroup * colocation is turned off by default even though the colocation group * itself has been allocated. Furthermore this colocation group cannot * be destroyted once it has been created. 
All of this has been as part * of runtime optimizations. * * The job of synchronizing tasks to the colocation group is done when * the colocation flag in the cgroup is turned on. */ static int __init create_default_coloc_group(void) { struct related_thread_group *grp = NULL; unsigned long flags; grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID); if (IS_ERR(grp)) { WARN_ON(1); return -ENOMEM; } write_lock_irqsave(&related_thread_group_lock, flags); list_add(&grp->list, &related_thread_groups); write_unlock_irqrestore(&related_thread_group_lock, flags); update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH); return 0; } late_initcall(create_default_coloc_group); int sync_cgroup_colocation(struct task_struct *p, bool insert) { unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; return sched_set_group_id(p, grp_id); } #endif int sched_set_group_id(struct task_struct *p, unsigned int group_id) { int rc = 0; unsigned long flags; struct related_thread_group *grp = NULL; raw_spin_lock_irqsave(&p->pi_lock, flags); write_lock(&related_thread_group_lock); /* Switching from one group to another directly is not permitted */ if ((current != p && p->flags & PF_EXITING) || (!p->grp && !group_id) || (p->grp && group_id)) goto done; if (!group_id) { remove_task_from_group(p); goto done; } grp = lookup_related_thread_group(group_id); if (!grp) { /* This is a reserved id */ if (group_id == DEFAULT_CGROUP_COLOC_ID) { rc = -EINVAL; goto done; } grp = alloc_related_thread_group(group_id); if (IS_ERR(grp)) { rc = -ENOMEM; goto done; } list_add(&grp->list, &related_thread_groups); } rc = add_task_to_group(p, grp); done: write_unlock(&related_thread_group_lock); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return rc; } unsigned int sched_get_group_id(struct task_struct *p) { unsigned int group_id; struct related_thread_group *grp; rcu_read_lock(); grp = task_related_thread_group(p); group_id = grp ? 
grp->id : 0; rcu_read_unlock(); return group_id; } static void update_cpu_cluster_capacity(const cpumask_t *cpus) { int i; struct sched_cluster *cluster; struct cpumask cpumask; cpumask_copy(&cpumask, cpus); pre_big_task_count_change(cpu_possible_mask); for_each_cpu(i, &cpumask) { cluster = cpu_rq(i)->cluster; cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); cluster->capacity = compute_capacity(cluster); cluster->load_scale_factor = compute_load_scale_factor(cluster); /* 'cpus' can contain cpumask more than one cluster */ check_for_up_down_migrate_update(&cluster->cpus); } __update_min_max_capacity(); post_big_task_count_change(cpu_possible_mask); } static DEFINE_SPINLOCK(cpu_freq_min_max_lock); void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) { struct cpumask cpumask; struct sched_cluster *cluster; int i, update_capacity = 0; unsigned long flags; spin_lock_irqsave(&cpu_freq_min_max_lock, flags); cpumask_copy(&cpumask, cpus); for_each_cpu(i, &cpumask) { cluster = cpu_rq(i)->cluster; cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); update_capacity += (cluster->max_mitigated_freq != fmax); cluster->max_mitigated_freq = fmax; } spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); if (update_capacity) update_cpu_cluster_capacity(cpus); } static int cpufreq_notifier_policy(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_policy *policy = (struct cpufreq_policy *)data; struct sched_cluster *cluster = NULL; struct cpumask policy_cluster = *policy->related_cpus; unsigned int orig_max_freq = 0; int i, j, update_capacity = 0; if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && val != CPUFREQ_CREATE_POLICY) return 0; if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { update_min_max_capacity(); return 0; } max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); if (min_max_freq == 1) min_max_freq = UINT_MAX; min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); BUG_ON(!min_max_freq); BUG_ON(!policy->max); for_each_cpu(i, &policy_cluster) { cluster = cpu_rq(i)->cluster; cpumask_andnot(&policy_cluster, &policy_cluster, &cluster->cpus); orig_max_freq = cluster->max_freq; cluster->min_freq = policy->min; cluster->max_freq = policy->max; cluster->cur_freq = policy->cur; if (!cluster->freq_init_done) { mutex_lock(&cluster_lock); for_each_cpu(j, &cluster->cpus) cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, policy->related_cpus); cluster->max_possible_freq = policy->cpuinfo.max_freq; cluster->max_possible_capacity = compute_max_possible_capacity(cluster); cluster->freq_init_done = true; sort_clusters(); update_all_clusters_stats(); mutex_unlock(&cluster_lock); continue; } update_capacity += (orig_max_freq != cluster->max_freq); } if (update_capacity) update_cpu_cluster_capacity(policy->related_cpus); return 0; } static int cpufreq_notifier_trans(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; unsigned int cpu = freq->cpu, new_freq = freq->new; unsigned long flags; struct sched_cluster *cluster; struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; int i, j; if (val != CPUFREQ_POSTCHANGE) return 0; BUG_ON(!new_freq); if (cpu_cur_freq(cpu) == new_freq) return 0; for_each_cpu(i, &policy_cpus) { cluster = cpu_rq(i)->cluster; for_each_cpu(j, &cluster->cpus) { struct rq *rq = cpu_rq(j); raw_spin_lock_irqsave(&rq->lock, flags); update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0); 
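			/*
			 * The update above snapshots each runqueue's window
			 * statistics at the point of the frequency transition,
			 * so busy time accrued so far is accounted before
			 * cluster->cur_freq is overwritten once this loop ends.
			 */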
raw_spin_unlock_irqrestore(&rq->lock, flags); } cluster->cur_freq = new_freq; cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); } return 0; } static int pwr_stats_ready_notifier(struct notifier_block *nb, unsigned long cpu, void *data) { cpumask_t mask = CPU_MASK_NONE; cpumask_set_cpu(cpu, &mask); sched_update_freq_max_load(&mask); mutex_lock(&cluster_lock); sort_clusters(); mutex_unlock(&cluster_lock); return 0; } static struct notifier_block notifier_policy_block = { .notifier_call = cpufreq_notifier_policy }; static struct notifier_block notifier_trans_block = { .notifier_call = cpufreq_notifier_trans }; static struct notifier_block notifier_pwr_stats_ready = { .notifier_call = pwr_stats_ready_notifier }; int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) { return -EINVAL; } static int register_sched_callback(void) { int ret; if (!sched_enable_hmp) return 0; ret = cpufreq_register_notifier(¬ifier_policy_block, CPUFREQ_POLICY_NOTIFIER); if (!ret) ret = cpufreq_register_notifier(¬ifier_trans_block, CPUFREQ_TRANSITION_NOTIFIER); register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); return 0; } /* * cpufreq callbacks can be registered at core_initcall or later time. * Any registration done prior to that is "forgotten" by cpufreq. See * initialization of variable init_cpufreq_transition_notifier_list_called * for further information. */ core_initcall(register_sched_callback); int update_preferred_cluster(struct related_thread_group *grp, struct task_struct *p, u32 old_load) { u32 new_load = task_load(p); if (!grp) return 0; /* * Update if task's load has changed significantly or a complete window * has passed since we last updated preference */ if (abs(new_load - old_load) > sched_ravg_window / 4 || sched_ktime_clock() - grp->last_update > sched_ravg_window) return 1; return 0; } bool early_detection_notify(struct rq *rq, u64 wallclock) { struct task_struct *p; int loop_max = 10; if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running) return 0; rq->ed_task = NULL; list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { if (!loop_max) break; if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { rq->ed_task = p; return 1; } loop_max--; } return 0; } #ifdef CONFIG_CGROUP_SCHED u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct task_group *tg = css_tg(css); return tg->upmigrate_discouraged; } int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 upmigrate_discourage) { struct task_group *tg = css_tg(css); int discourage = upmigrate_discourage > 0; if (tg->upmigrate_discouraged == discourage) return 0; /* * Revisit big-task classification for tasks of this cgroup. It would * have been efficient to walk tasks of just this cgroup in running * state, but we don't have easy means to do that. Walk all tasks in * running state on all cpus instead and re-visit their big task * classification. */ get_online_cpus(); pre_big_task_count_change(cpu_online_mask); tg->upmigrate_discouraged = discourage; post_big_task_count_change(cpu_online_mask); put_online_cpus(); return 0; } #endif /* CONFIG_CGROUP_SCHED */
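
/*
 * Illustrative usage of the upmigrate_discourage attribute above (the
 * cgroup file name and path are assumptions; the cftype wiring lives
 * outside this file):
 *
 *	# discourage up-migration for everything in the "background" group
 *	echo 1 > /sys/fs/cgroup/cpu/background/cpu.upmigrate_discourage
 *
 * Writing the attribute re-runs big-task classification for all
 * running tasks on all online CPUs, as described above.
 */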