Diffstat (limited to 'kernel')
97 files changed, 15049 insertions, 1527 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e0a4d971dd64..e76b512082a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2251,10 +2251,11 @@ int __audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = uid; security_task_getsecid(tsk, &audit_sig_sid); } - if (!audit_signals || audit_dummy_context()) - return 0; } + if (!audit_signals || audit_dummy_context()) + return 0; + /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dd44ebdb2c2..fe08e12683fe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -716,10 +717,10 @@ static void css_set_move_task(struct task_struct *task, if (to_cset) { /* - * We are synchronized through cgroup_threadgroup_rwsem - * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * We are synchronized through css_set_lock against + * PF_EXITING setting such that we can't race against + * cgroup_exit() disassociating the task from the + * css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1652,10 +1653,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1722,15 +1719,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -2013,9 +2001,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; - struct cgroup_root *root; + struct cgroup_root *root = NULL; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; @@ -2029,6 +2018,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2036,15 +2036,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems 
may * still be dying after the previous unmount. Let's drain the @@ -2155,9 +2146,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2200,6 +2192,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2801,6 +2799,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -2815,6 +2814,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return retval; @@ -4078,6 +4078,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) @@ -4110,6 +4112,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; } @@ -5405,6 +5408,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; @@ -5710,19 +5714,22 @@ void cgroup_exit(struct task_struct *tsk) int i; /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. + * Avoid potential race with the migrate path. + */ + spin_lock_irq(&css_set_lock); + /* + * Unlink from @tsk from its css_set. 
*/ cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } + spin_unlock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ for_each_subsys_which(ss, i, &have_exit_callback) ss->exit(tsk); @@ -5784,7 +5791,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; + spin_lock_irq(&css_set_lock); path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_unlock_irq(&css_set_lock); if (!path) goto out; @@ -5959,7 +5968,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct task_struct *task; int count = 0; - seq_printf(seq, "css_set %p\n", cset); + seq_printf(seq, "css_set %pK\n", cset); list_for_each_entry(task, &cset->tasks, cg_list) { if (count++ > MAX_TASKS_SHOWN_PER_CSS) diff --git a/kernel/cpu.c b/kernel/cpu.c index 977b1add6fc9..223f8f208df1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -92,6 +92,11 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +void cpu_hotplug_mutex_held(void) +{ + lockdep_assert_held(&cpu_hotplug.lock); +} +EXPORT_SYMBOL(cpu_hotplug_mutex_held); void get_online_cpus(void) { @@ -368,6 +373,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; + cpu_hotplug_begin(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -528,8 +536,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } @@ -553,9 +561,41 @@ out: return ret; } +static int switch_to_rt_policy(void) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + unsigned int policy = current->policy; + int err; + + /* Nobody should be attempting hotplug from these policy contexts. */ + if (policy == SCHED_BATCH || policy == SCHED_IDLE || + policy == SCHED_DEADLINE) + return -EPERM; + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + + /* Only SCHED_NORMAL left. 
*/ + err = sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + return err; + +} + +static int switch_to_fair_policy(void) +{ + struct sched_param param = { .sched_priority = 0 }; + + return sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); +} + int cpu_up(unsigned int cpu) { int err = 0; + int switch_err = 0; + + switch_err = switch_to_rt_policy(); + if (switch_err < 0) + return switch_err; if (!cpu_possible(cpu)) { pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", @@ -581,6 +621,14 @@ int cpu_up(unsigned int cpu) out: cpu_maps_update_done(); + + if (!switch_err) { + switch_err = switch_to_fair_policy(); + if (switch_err) + pr_err("Hotplug policy switch err=%d Task %s pid=%d\n", + switch_err, current->comm, current->pid); + } + return err; } EXPORT_SYMBOL_GPL(cpu_up); @@ -600,7 +648,7 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - pr_info("Disabling non-boot CPUs ...\n"); + pr_debug("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; @@ -650,7 +698,7 @@ void enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - pr_info("Enabling non-boot CPUs ...\n"); + pr_debug("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); @@ -659,7 +707,7 @@ void enable_nonboot_cpus(void) error = _cpu_up(cpu, 1); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { - pr_info("CPU%d is up\n", cpu); + pr_debug("CPU%d is up\n", cpu); cpu_device = get_cpu_device(cpu); if (!cpu_device) pr_err("%s: failed to get cpu%d device\n", @@ -805,6 +853,10 @@ static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); +static DECLARE_BITMAP(cpu_isolated_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_isolated_mask = to_cpumask(cpu_isolated_bits); +EXPORT_SYMBOL(cpu_isolated_mask); + void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) @@ -839,6 +891,14 @@ void set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); } +void set_cpu_isolated(unsigned int cpu, bool isolated) +{ + if (isolated) + cpumask_set_cpu(cpu, to_cpumask(cpu_isolated_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_isolated_bits)); +} + void init_cpu_present(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_present_bits), src); @@ -854,6 +914,11 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(to_cpumask(cpu_online_bits), src); } +void init_cpu_isolated(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_isolated_bits), src); +} + enum cpu_mitigations cpu_mitigations = CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index f1042d639eee..38dcb7c1c02c 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -22,14 +22,17 @@ #include <linux/spinlock.h> #include <linux/syscore_ops.h> +bool from_suspend = false; + static DEFINE_RWLOCK(cpu_pm_notifier_lock); static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls, + void *data) { int ret; - ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, + ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, data, nr_to_call, nr_calls); return notifier_to_errno(ret); @@ -101,13 +104,13 @@ int 
cpu_pm_enter(void) int ret = 0; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); + ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls, NULL); if (ret) /* * Inform listeners (nr_calls - 1) about failure of CPU PM * PM entry who are notified earlier to prepare for it. */ - cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); + cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL, NULL); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -131,7 +134,7 @@ int cpu_pm_exit(void) int ret; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL, NULL); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -154,19 +157,21 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); * * Return conditions are same as __raw_notifier_call_chain. */ -int cpu_cluster_pm_enter(void) +int cpu_cluster_pm_enter(unsigned long aff_level) { int nr_calls = 0; int ret = 0; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); + ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls, + (void *) aff_level); if (ret) /* * Inform listeners (nr_calls - 1) about failure of CPU cluster * PM entry who are notified earlier to prepare for it. */ - cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); + cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL, + (void *) aff_level); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -188,12 +193,12 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * * Return conditions are same as __raw_notifier_call_chain. */ -int cpu_cluster_pm_exit(void) +int cpu_cluster_pm_exit(unsigned long aff_level) { int ret; read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL, (void *) aff_level); read_unlock(&cpu_pm_notifier_lock); return ret; @@ -205,17 +210,19 @@ static int cpu_pm_suspend(void) { int ret; + from_suspend = true; ret = cpu_pm_enter(); if (ret) return ret; - ret = cpu_cluster_pm_enter(); + ret = cpu_cluster_pm_enter(0); return ret; } static void cpu_pm_resume(void) { - cpu_cluster_pm_exit(); + from_suspend = false; + cpu_cluster_pm_exit(0); cpu_pm_exit(); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index db86d6b609b6..2ce1a5297b92 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -99,7 +99,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; - cpumask_var_t cpus_requested; + cpumask_var_t cpus_requested; /* CPUS requested, but not used because of hotplug */ nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -419,14 +419,19 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&trial->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) goto free_cpus; cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->cpus_requested, cs->cpus_requested); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; free_cpus: + free_cpumask_var(trial->cpus_requested); +free_allowed: free_cpumask_var(trial->cpus_allowed); free_cs: kfree(trial); @@ -440,6 +445,7 @@ free_cs: static void free_trial_cpuset(struct cpuset *trial) { free_cpumask_var(trial->effective_cpus); + free_cpumask_var(trial->cpus_requested); free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -807,16 +813,15 @@ done: * 'cpus' is 
removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_unlocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + cpu_hotplug_mutex_held(); lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -824,27 +829,27 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_unlocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { + get_online_cpus(); mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } /** @@ -876,7 +881,6 @@ static void update_tasks_cpumask(struct cpuset *cs) * * On legacy hierachy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { @@ -931,7 +935,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); } /** @@ -950,23 +954,23 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EACCES; /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. + * An empty cpus_requested is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. */ if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->cpus_requested); } else { retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; + } - if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) - return -EINVAL; + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) + return -EINVAL; - cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); - } + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) @@ -1290,7 +1294,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); } return 0; @@ -1321,7 +1325,6 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_mutex held. 
*/ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1356,7 +1359,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1621,6 +1624,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1658,6 +1662,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1668,6 +1673,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1682,6 +1688,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1720,6 +1727,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1745,6 +1753,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -1958,11 +1967,11 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!cs) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) - goto free_allowed; + goto error_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_requested; + goto error_effective; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto error_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); @@ -1975,11 +1984,11 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_requested: - free_cpumask_var(cs->cpus_requested); -free_allowed: +error_requested: + free_cpumask_var(cs->effective_cpus); +error_effective: free_cpumask_var(cs->cpus_allowed); -free_cs: +error_allowed: kfree(cs); return ERR_PTR(-ENOMEM); } @@ -2051,13 +2060,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_unlocked(). 
*/ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) @@ -2067,6 +2077,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2278,7 +2289,8 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, + parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); diff --git a/kernel/events/core.c b/kernel/events/core.c index c349c1ea6bec..a1fc28e6ea6a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -158,6 +158,7 @@ enum event_type_t { struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(bool, is_idle); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -177,7 +178,9 @@ static struct srcu_struct pmus_srcu; * 2 - disallow kernel profiling for unpriv * 3 - disallow all unpriv perf event use */ -#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +#ifdef CONFIG_PERF_EVENTS_USERMODE +int sysctl_perf_event_paranoid __read_mostly = -1; +#elif defined CONFIG_SECURITY_PERF_EVENTS_RESTRICT int sysctl_perf_event_paranoid __read_mostly = 3; #else int sysctl_perf_event_paranoid __read_mostly = 1; @@ -1497,10 +1500,17 @@ static void perf_group_detach(struct perf_event *event) * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. + * If this isn't on a list, make sure we still remove the sibling's + * group_entry from this sibling_list; otherwise, when that sibling + * is later deallocated, it will try to remove itself from this + * sibling_list, which may well have been deallocated already, + * resulting in a use-after-free. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { if (list) list_move_tail(&sibling->group_entry, list); + else + list_del_init(&sibling->group_entry); sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ @@ -1695,7 +1705,32 @@ static int __perf_remove_from_context(void *info) } -/* +#ifdef CONFIG_SMP +static void perf_retry_remove(struct perf_event *event, + struct remove_event *rep) +{ + int up_ret; + /* + * CPU was offline. Bring it online so we can + * gracefully exit a perf context. + */ + up_ret = cpu_up(event->cpu); + if (!up_ret) + /* Try the remove call once again. */ + cpu_function_call(event->cpu, __perf_remove_from_context, + rep); + else + pr_err("Failed to bring up CPU: %d, ret: %d\n", + event->cpu, up_ret); +} +#else +static void perf_retry_remove(struct perf_event *event, + struct remove_event *rep) +{ +} +#endif + + /* * Remove the event from a task's (or a CPU's) list of events. * * CPU events are removed with a smp call. For task events we only @@ -1708,7 +1743,8 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. 
*/ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void __ref perf_remove_from_context(struct perf_event *event, + bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1716,6 +1752,7 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group .event = event, .detach_group = detach_group, }; + int ret; lockdep_assert_held(&ctx->mutex); @@ -1726,7 +1763,11 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group * already called __perf_remove_from_context from * perf_event_exit_cpu. */ - cpu_function_call(event->cpu, __perf_remove_from_context, &re); + ret = cpu_function_call(event->cpu, __perf_remove_from_context, + &re); + if (ret == -ENXIO) + perf_retry_remove(event, &re); + return; } @@ -1922,8 +1963,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2404,6 +2450,29 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +static int __perf_event_stop(void *info) +{ + struct perf_event *event = info; + + /* for AUX events, our job is done if the event is already inactive */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. 
+ */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + return 0; +} + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -3379,21 +3448,30 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + event_cpu = READ_ONCE(event->oncpu); + + if (event->state == PERF_EVENT_STATE_ACTIVE && + !cpu_isolated(event_cpu)) { struct perf_read_data data = { .event = event, .group = group, .ret = 0, }; - smp_call_function_single(event->oncpu, - __perf_event_read, &data, 1); - ret = data.ret; + + if ((unsigned int)event_cpu >= nr_cpu_ids) + return 0; + if (!event->attr.exclude_idle || + !per_cpu(is_idle, event_cpu)) { + smp_call_function_single(event_cpu, + __perf_event_read, &data, 1); + ret = data.ret; + } } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3488,7 +3566,8 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + if (event->owner != EVENT_OWNER_KERNEL && perf_paranoid_cpu() && + !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* @@ -3731,6 +3810,9 @@ static void __free_event(struct perf_event *event) if (event->destroy) event->destroy(event); + if (event->pmu->free_drv_configs) + event->pmu->free_drv_configs(event); + if (event->ctx) put_ctx(event->ctx); @@ -3874,6 +3956,15 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { + struct perf_event *event = file->private_data; + + /* + * Event can be in state OFF because of a constraint check. + * Change to ACTIVE so that it gets cleaned up correctly. 
+ */ + if ((event->state == PERF_EVENT_STATE_OFF) && + event->attr.constraint_duplicate) + event->state = PERF_EVENT_STATE_ACTIVE; put_event(file->private_data); return 0; } @@ -4288,6 +4379,8 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_event_drv_configs(struct perf_event *event, + void __user *arg); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -4344,6 +4437,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_SET_DRV_CONFIGS: + return perf_event_drv_configs(event, (void __user *)arg); + default: return -ENOTTY; } @@ -4376,6 +4472,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): + case _IOC_NR(PERF_EVENT_IOC_SET_DRV_CONFIGS): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; @@ -4660,6 +4757,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4687,10 +4786,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). 
+ */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5775,6 +5886,80 @@ next: rcu_read_unlock(); } +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(event); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. 
+ */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * @@ -6978,6 +7163,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .events_across_hotplug = 1, }; #ifdef CONFIG_EVENT_TRACING @@ -7101,6 +7288,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .events_across_hotplug = 1, }; static inline void perf_tp_register(void) @@ -7213,6 +7402,15 @@ void perf_bp_event(struct perf_event *bp, void *data) } #endif +static int perf_event_drv_configs(struct perf_event *event, + void __user *arg) +{ + if (!event->pmu->get_drv_configs) + return -EINVAL; + + return event->pmu->get_drv_configs(event, arg); +} + /* * hrtimer based swevent callback */ @@ -7380,6 +7578,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .events_across_hotplug = 1, }; /* @@ -7461,6 +7661,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .events_across_hotplug = 1, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -7941,6 +8143,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!group_leader) group_leader = event; + mutex_init(&event->group_leader_mutex); mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); @@ -7949,6 +8152,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->drv_configs); INIT_HLIST_NODE(&event->hlist_entry); @@ -8368,6 +8572,9 @@ SYSCALL_DEFINE5(perf_event_open, if (err) return err; + if (attr.constraint_duplicate || attr.__reserved_1) + return -EINVAL; + if (!attr.exclude_kernel) { if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; @@ -8408,6 +8615,16 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + /* + * Take the group_leader's group_leader_mutex before observing + * anything in the group leader that leads to changes in ctx, + * many of which may be changing on another thread. + * In particular, we want to take this lock before deciding + * whether we need to move_group. 
+ */ + if (group_leader) + mutex_lock(&group_leader->group_leader_mutex); + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { @@ -8686,6 +8903,8 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); + if (group_leader) + mutex_unlock(&group_leader->group_leader_mutex); if (task) { mutex_unlock(&task->signal->cred_guard_mutex); @@ -8735,6 +8954,8 @@ err_task: if (task) put_task_struct(task); err_group_fd: + if (group_leader) + mutex_unlock(&group_leader->group_leader_mutex); fdput(group); err_fd: put_unused_fd(event_fd); @@ -9446,29 +9667,90 @@ static void __perf_event_exit_context(void *__info) rcu_read_unlock(); } +static void __perf_event_stop_swclock(void *__info) +{ + struct perf_event_context *ctx = __info; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { + if (event->attr.config == PERF_COUNT_SW_CPU_CLOCK && + event->attr.type == PERF_TYPE_SOFTWARE) + cpu_clock_event_stop(event, 0); + } +} + static void perf_event_exit_cpu_context(int cpu) { + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; + unsigned long flags; struct pmu *pmu; int idx; idx = srcu_read_lock(&pmus_srcu); list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + + /* Cancel the mux hrtimer to avoid CPU migration */ + if (pmu->task_ctx_nr != perf_sw_context) { + raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); + hrtimer_cancel(&cpuctx->hrtimer); + cpuctx->hrtimer_active = 0; + raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, + flags); + } mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + /* + * If keeping events across hotplugging is supported, do not + * remove the event list, but keep it alive across CPU hotplug. + * The context is exited via an fd close path when userspace + * is done and the target CPU is online. If software clock + * event is active, then stop hrtimer associated with it. + * Start the timer when the CPU comes back online. 
+ */ + if (!pmu->events_across_hotplug) + smp_call_function_single(cpu, __perf_event_exit_context, + ctx, 1); + else + smp_call_function_single(cpu, __perf_event_stop_swclock, + ctx, 1); mutex_unlock(&ctx->mutex); } srcu_read_unlock(&pmus_srcu, idx); } +static void perf_event_start_swclock(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + struct perf_event *event, *tmp; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->events_across_hotplug) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + list_for_each_entry_safe(event, tmp, &ctx->event_list, + event_entry) { + if (event->attr.config == + PERF_COUNT_SW_CPU_CLOCK && + event->attr.type == PERF_TYPE_SOFTWARE) + cpu_clock_event_start(event, 0); + } + } + } + srcu_read_unlock(&pmus_srcu, idx); +} + static void perf_event_exit_cpu(int cpu) { perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } +static inline void perf_event_start_swclock(int cpu) { } #endif static int @@ -9507,6 +9789,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; + + case CPU_STARTING: + perf_event_start_swclock(cpu); + break; + default: break; } @@ -9514,6 +9801,25 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } +static int event_idle_notif(struct notifier_block *nb, unsigned long action, + void *data) +{ + switch (action) { + case IDLE_START: + __this_cpu_write(is_idle, true); + break; + case IDLE_END: + __this_cpu_write(is_idle, false); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block perf_event_idle_nb = { + .notifier_call = event_idle_notif, +}; + void __init perf_event_init(void) { int ret; @@ -9527,6 +9833,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + idle_notifier_register(&perf_event_idle_nb); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a27245fdcd81..d97a5430d580 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -598,6 +598,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .events_across_hotplug = 1, }; int __init init_hw_breakpoint(void) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 8baa3121e7a6..7e59e583e1a3 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,7 +11,6 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 410f83cad06c..b128cc829bd8 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -247,8 +247,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -269,16 +267,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} - -static void ring_buffer_put_async(struct 
ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); } /* @@ -290,6 +278,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -318,7 +310,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the aux buffer is in perf_mmap_close(), about to get freed. */ if (!atomic_read(&rb->aux_mmap_count)) - goto err; + goto err_put; /* * Nesting is not supported for AUX area, make sure nested @@ -361,10 +353,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -375,6 +368,10 @@ err: * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) @@ -422,8 +419,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -504,6 +502,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). 
+ */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -582,7 +588,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, goto out; } - rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; @@ -615,18 +621,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d937fbbc3642..93f70912d380 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1699,8 +1699,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return -EINVAL; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/exit.c b/kernel/exit.c index 553e8a047ab0..babbc3c0a181 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -391,6 +391,7 @@ static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; + int mm_released; exit_mm_release(tsk, mm); if (!mm) @@ -440,9 +441,12 @@ static void exit_mm(struct task_struct *tsk) enter_lazy_tlb(mm, current); task_unlock(tsk); mm_update_next_owner(mm); - mmput(mm); + + mm_released = mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); + if (mm_released) + set_tsk_thread_flag(tsk, TIF_MM_RELEASED); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -646,6 +650,7 @@ static void check_stack_usage(void) static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; + int islower = false; free = stack_not_used(current); @@ -654,11 +659,16 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", - current->comm, task_pid_nr(current), free); lowest_to_date = free; + islower = true; } spin_unlock(&low_water_lock); + + if (islower) { + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); + } } #else static inline void check_stack_usage(void) {} @@ -706,6 +716,7 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ + sched_exit(tsk); schedtune_exit_task(tsk); if (unlikely(in_atomic())) { diff --git a/kernel/fork.c b/kernel/fork.c index 56fda05f5137..5103cd9a2769 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/iocontext.h> +#include <linux/kasan.h> #include <linux/key.h> #include <linux/binfmts.h> #include <linux/mman.h> @@ -174,6 +175,7 @@ static inline void free_thread_stack(unsigned long *stack) { struct page *page = virt_to_page(stack); + kasan_alloc_pages(page, THREAD_SIZE_ORDER); kaiser_unmap_thread_stack(stack); __free_kmem_pages(page, THREAD_SIZE_ORDER); } @@ -729,12 +731,16 @@ static inline void __mmput(struct mm_struct *mm) /* * Decrement the use count 
and release all resources for an mm. */ -void mmput(struct mm_struct *mm) +int mmput(struct mm_struct *mm) { + int mm_freed = 0; might_sleep(); - if (atomic_dec_and_test(&mm->mm_users)) + if (atomic_dec_and_test(&mm->mm_users)) { __mmput(mm); + mm_freed = 1; + } + return mm_freed; } EXPORT_SYMBOL_GPL(mmput); @@ -1721,6 +1727,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + free_task_load_ptrs(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/futex.c b/kernel/futex.c index 6d47b7dc1cfb..5990a8f95ae0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -792,7 +792,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 67aafc2b249c..f4c99c41fee2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -33,6 +33,7 @@ static irqreturn_t bad_chained_irq(int irq, void *dev_id) */ struct irqaction chained_action = { .handler = bad_chained_irq, + .name = "chained-irq", }; /** @@ -327,11 +328,12 @@ void unmask_threaded_irq(struct irq_desc *desc) * handler. The handler function is called inside the calling * threads context. */ -void handle_nested_irq(unsigned int irq) +bool handle_nested_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; irqreturn_t action_ret; + bool handled = false; might_sleep(); @@ -356,8 +358,11 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + handled = true; + out_unlock: raw_spin_unlock_irq(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_nested_irq); @@ -404,8 +409,10 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void handle_simple_irq(struct irq_desc *desc) +bool handle_simple_irq(struct irq_desc *desc) { + bool handled = false; + raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) @@ -421,8 +428,11 @@ void handle_simple_irq(struct irq_desc *desc) kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); + handled = true; + out_unlock: raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_simple_irq); @@ -453,8 +463,10 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void handle_level_irq(struct irq_desc *desc) +bool handle_level_irq(struct irq_desc *desc) { + bool handled = false; + raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -477,8 +489,11 @@ void handle_level_irq(struct irq_desc *desc) cond_unmask_irq(desc); + handled = true; + out_unlock: raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -522,9 +537,10 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. 
*/ -void handle_fasteoi_irq(struct irq_desc *desc) +bool handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; + bool handled = false; raw_spin_lock(&desc->lock); @@ -552,12 +568,15 @@ void handle_fasteoi_irq(struct irq_desc *desc) cond_unmask_eoi_irq(desc, chip); + handled = true; + raw_spin_unlock(&desc->lock); - return; + return handled; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); @@ -576,8 +595,10 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void handle_edge_irq(struct irq_desc *desc) +bool handle_edge_irq(struct irq_desc *desc) { + bool handled = false; + raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); @@ -621,12 +642,14 @@ void handle_edge_irq(struct irq_desc *desc) } handle_irq_event(desc); + handled = true; } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); + return handled; } EXPORT_SYMBOL(handle_edge_irq); @@ -638,8 +661,9 @@ EXPORT_SYMBOL(handle_edge_irq); * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ -void handle_edge_eoi_irq(struct irq_desc *desc) +bool handle_edge_eoi_irq(struct irq_desc *desc) { + bool handled = false; struct irq_chip *chip = irq_desc_get_chip(desc); raw_spin_lock(&desc->lock); @@ -667,6 +691,7 @@ void handle_edge_eoi_irq(struct irq_desc *desc) goto out_eoi; handle_irq_event(desc); + handled = true; } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); @@ -674,6 +699,7 @@ void handle_edge_eoi_irq(struct irq_desc *desc) out_eoi: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); + return handled; } #endif @@ -683,7 +709,7 @@ out_eoi: * * Per CPU interrupts on SMP machines without locking requirements */ -void handle_percpu_irq(struct irq_desc *desc) +bool handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -696,6 +722,8 @@ void handle_percpu_irq(struct irq_desc *desc) if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); + + return true; } /** @@ -709,7 +737,7 @@ void handle_percpu_irq(struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(struct irq_desc *desc) +bool handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; @@ -728,6 +756,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); + + return true; } void @@ -836,7 +866,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT | + IRQD_AFFINITY_MANAGED); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) @@ -845,6 +876,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); + if (irq_settings_has_affinity_managed_set(desc)) + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); 
irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..4684b7595e63 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/irq.h> +#include <linux/cpumask.h> #include "internals.h" @@ -20,6 +21,7 @@ static bool migrate_one_irq(struct irq_desc *desc) const struct cpumask *affinity = d->common->affinity; struct irq_chip *c; bool ret = false; + struct cpumask available_cpus; /* * If this is a per-CPU interrupt, or the affinity does not @@ -29,8 +31,37 @@ static bool migrate_one_irq(struct irq_desc *desc) !cpumask_test_cpu(smp_processor_id(), affinity)) return false; + cpumask_copy(&available_cpus, affinity); + cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask); + affinity = &available_cpus; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ + cpumask_andnot(&available_cpus, cpu_online_mask, + cpu_isolated_mask); + if (cpumask_intersects(&available_cpus, irq_default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + irq_default_affinity); + else if (cpumask_empty(&available_cpus)) + affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. irq_set_affinity_locked() call + * below notify this mask to PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity. + */ + affinity = cpumask_of(cpumask_any(affinity)); ret = true; } @@ -38,7 +69,7 @@ static bool migrate_one_irq(struct irq_desc *desc) if (!c->irq_set_affinity) { pr_debug("IRQ%u: unable to set affinity\n", d->irq); } else { - int r = irq_do_set_affinity(d, affinity, false); + int r = irq_set_affinity_locked(d, affinity, false); if (r) pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, r); @@ -69,6 +100,9 @@ void irq_migrate_all_off_this_cpu(void) bool affinity_broken; desc = irq_to_desc(irq); + if (!desc) + continue; + raw_spin_lock(&desc->lock); affinity_broken = migrate_one_irq(desc); raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff7857e87..80e76ddbf804 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -26,13 +26,14 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 
*/ -void handle_bad_irq(struct irq_desc *desc) +bool handle_bad_irq(struct irq_desc *desc) { unsigned int irq = irq_desc_get_irq(desc); print_irq_desc(irq, desc); kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); + return true; } EXPORT_SYMBOL_GPL(handle_bad_irq); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..56afc0be6289 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 239e2ae2c947..52fbf88cd2d8 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/bitmap.h> #include <linux/irqdomain.h> +#include <linux/wakeup_reason.h> #include "internals.h" @@ -339,16 +340,25 @@ void irq_init_desc(unsigned int irq) /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle - * + * returns: + * negative on error + * 0 when the interrupt handler was not called + * 1 when the interrupt handler was called */ + int generic_handle_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc) return -EINVAL; - generic_handle_irq_desc(desc); - return 0; + + if (unlikely(logging_wakeup_reasons_nosync())) + return log_possible_wakeup_reason(irq, + desc, + generic_handle_irq_desc); + + return generic_handle_irq_desc(desc); } EXPORT_SYMBOL_GPL(generic_handle_irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f5bb63cbb6b4..668141c23d8f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) } /** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. 
+ */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + +/** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed * diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index cd6009006510..41b40f310c28 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq; + int i, ret, virq = 0; ret = ops->msi_check(domain, info, dev); if (ret == 0) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..28e134310435 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -11,7 +11,7 @@ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> - +#include <linux/wakeup_reason.h> #include "internals.h" bool irq_pm_check_wakeup(struct irq_desc *desc) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..b05509af0352 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) @@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, goto free_cpumask; } + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } + /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 320579d89091..f0964f058521 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -17,6 +17,7 @@ enum { _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, + _IRQ_AFFINITY_MANAGED = IRQ_AFFINITY_MANAGED, }; #define IRQ_PER_CPU GOT_YOU_MORON @@ -32,6 +33,7 @@ enum { #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON +#define IRQ_AFFINITY_MANAGED GOT_YOU_MORON static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) @@ -65,6 +67,16 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) return desc->status_use_accessors & _IRQ_NO_BALANCING; } +static inline void irq_settings_set_affinity_managed(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_AFFINITY_MANAGED; +} + +static inline bool irq_settings_has_affinity_managed_set(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_AFFINITY_MANAGED; +} + static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909..c61c56f05dfa 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -26,6 +26,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#include <linux/delay.h> /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -378,6 +379,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, * values at the cost of a few extra spins. 
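Editor's note: the write_irq_affinity() hunk above adds a check that rejects a user-supplied mask consisting only of isolated CPUs. A userspace model of that validation, with 32-bit words standing in for cpumasks; validate_user_affinity() is an illustrative helper, not a kernel function.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/* Reject the request if nothing is asked for, or if every requested CPU
 * is isolated (the requested mask is a subset of the isolated mask). */
static int validate_user_affinity(uint32_t requested, uint32_t isolated)
{
	if (!requested)
		return -EINVAL;
	if ((requested & ~isolated) == 0)
		return -EINVAL;
	return 0;
}

int main(void)
{
	uint32_t isolated = 0x6;	/* CPUs 1 and 2 isolated */

	printf("mask 0x6 -> %d (rejected)\n", validate_user_affinity(0x6, isolated));
	printf("mask 0x5 -> %d (accepted)\n", validate_user_affinity(0x5, isolated));
	return 0;
}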
*/ cpu_relax_lowlatency(); + + /* + * On arm systems, we must slow down the waiter's repeated + * aquisition of spin_mlock and atomics on the lock count, or + * we risk starving out a thread attempting to release the + * mutex. The mutex slowpath release must take spin lock + * wait_lock. This spin lock can share a monitor with the + * other waiter atomics in the mutex data structure, so must + * take care to rate limit the waiters. + */ + udelay(1); } osq_unlock(&lock->osq); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 8d7047ecef4e..0befa20ce96e 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -1,6 +1,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/osq_lock.h> +#include <linux/sched/rt.h> /* * An MCS like lock especially tailored for optimistic spinning for sleeping @@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *prev, *next; + struct task_struct *task = current; int curr = encode_cpu(smp_processor_id()); int old; @@ -131,8 +133,13 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. + * If a task spins on owner on a CPU after acquiring + * osq_lock while a RT task spins on another CPU to + * acquire osq_lock, it will starve the owner from + * completing if owner is to be scheduled on the same CPU. + * It will be a live lock. */ - if (need_resched()) + if (need_resched() || rt_task(task)) goto unqueue; cpu_relax_lowlatency(); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 95e610e3f7ef..989991f00dc7 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,6 +12,8 @@ #include <linux/debug_locks.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/bug.h> +#include <soc/qcom/watchdog.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key) @@ -64,6 +66,11 @@ static void spin_dump(raw_spinlock_t *lock, const char *msg) owner ? owner->comm : "<none>", owner ? task_pid_nr(owner) : -1, READ_ONCE(lock->owner_cpu)); +#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG + msm_trigger_wdog_bite(); +#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG) + BUG(); +#endif dump_stack(); } @@ -114,7 +121,7 @@ static void __spin_lock_debug(raw_spinlock_t *lock) __delay(1); } /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); + spin_bug(lock, "lockup suspected"); #ifdef CONFIG_SMP trigger_all_cpu_backtrace(); #endif @@ -167,6 +174,11 @@ static void rwlock_bug(rwlock_t *lock, const char *msg) printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", msg, raw_smp_processor_id(), current->comm, task_pid_nr(current), lock); +#ifdef CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG + msm_trigger_wdog_bite(); +#elif defined(CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG) + BUG(); +#endif dump_stack(); } diff --git a/kernel/module.c b/kernel/module.c index d84f5e38456f..7ce940ab20dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2522,7 +2522,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* We'll tack temporary mod_kallsyms on the end. 
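Editor's note: the mutex_optimistic_spin() hunk above rate limits each spin iteration with udelay(1) so the waiter's cache traffic cannot starve the thread releasing the mutex. A rough userspace analog of a rate-limited spin, using C11 atomics and usleep() in place of the lock owner field and udelay(); this is a sketch of the pattern, not the mutex slowpath itself.

#include <stdatomic.h>
#include <stdbool.h>
#include <unistd.h>

struct spin_mutex {
	_Atomic(void *) owner;		/* stand-in for lock->owner */
};

static bool spin_on_owner(struct spin_mutex *lock, void *me, int max_spins)
{
	int spins = 0;

	while (atomic_load_explicit(&lock->owner, memory_order_acquire)) {
		if (++spins > max_spins)
			return false;	/* give up, fall back to blocking */
		usleep(1);		/* rate limit, as udelay(1) does above */
	}

	/* Try to take the lock; a real slowpath would retry or block. */
	void *expected = NULL;
	return atomic_compare_exchange_strong(&lock->owner, &expected, me);
}

int main(void)
{
	struct spin_mutex m = { .owner = NULL };
	int me;

	return spin_on_owner(&m, &me, 100) ? 0 : 1;
}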
*/ mod->init_size = ALIGN(mod->init_size, - __alignof__(struct mod_kallsyms)); + __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_size; mod->init_size += sizeof(struct mod_kallsyms); mod->init_size = debug_align(mod->init_size); @@ -2602,7 +2602,13 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } -#ifdef CONFIG_DEBUG_KMEMLEAK +#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + kmemleak_no_scan(mod->module_core); +} +#elif defined(CONFIG_DEBUG_KMEMLEAK) static void kmemleak_load_module(const struct module *mod, const struct load_info *info) { diff --git a/kernel/panic.c b/kernel/panic.c index dde00886c896..aceb10bcfc2f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,10 @@ #include <linux/init.h> #include <linux/nmi.h> #include <linux/console.h> +#include <soc/qcom/minidump.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/exception.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -77,6 +81,8 @@ void panic(const char *fmt, ...) long i, i_next = 0; int state = 0; + trace_kernel_panic(0); + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -104,6 +110,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); + dump_stack_minidump(0); pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* @@ -179,6 +186,9 @@ void panic(const char *fmt, ...) mdelay(PANIC_TIMER_STEP); } } + + trace_kernel_panic_late(0); + if (panic_timeout != 0) { /* * This will not be a clean reboot, with everything diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4335e7d1c391..500ba8b970e4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -322,3 +322,7 @@ config PM_GENERIC_DOMAINS_OF config CPU_PM bool + +config DEDUCE_WAKEUP_REASONS + bool + default n diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 70d9f3f031e3..184ea54e721a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -30,6 +30,7 @@ #include <linux/genhd.h> #include <linux/ktime.h> #include <trace/events/power.h> +#include <soc/qcom/boot_stats.h> #include "power.h" @@ -469,6 +470,7 @@ static int resume_target_kernel(bool platform_mode) touch_softlockup_watchdog(); syscore_resume(); + place_marker("PM: Image Restoration failed!"); Enable_irqs: local_irq_enable(); @@ -705,6 +707,7 @@ int hibernate(void) pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); + place_marker("PM: Image restored!"); } Free_bitmaps: @@ -1152,6 +1155,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. 
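Editor's note: the kmemleak_load_module() change earlier in this hunk picks one of three implementations at preprocessor time, depending on which debug options are enabled together. A compile-time selection sketch of the same #if / #elif / #else shape; the CONFIG_ macros defined here are stand-ins set locally for the demo, not real kconfig symbols.

#include <stdio.h>

#define CONFIG_DEBUG_KMEMLEAK 1
/* #define CONFIG_DEBUG_MODULE_SCAN_OFF 1 */

#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF)
static void load_module_hook(const char *name)
{
	printf("%s: mark module region no-scan only\n", name);
}
#elif defined(CONFIG_DEBUG_KMEMLEAK)
static void load_module_hook(const char *name)
{
	printf("%s: register module sections with the leak scanner\n", name);
}
#else
static void load_module_hook(const char *name) { (void)name; }
#endif

int main(void)
{
	load_module_hook("demo_module");
	return 0;
}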
+ */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1160,3 +1179,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 68c0eaae8034..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -287,13 +287,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -571,14 +565,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 436e302eb024..2610516601ee 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ diff --git a/kernel/power/process.c b/kernel/power/process.c index cc177142a08f..372de061dda2 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -37,9 +37,6 @@ static int try_to_freeze_tasks(bool user_only) unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; -#ifdef CONFIG_PM_SLEEP - char suspend_abort[MAX_SUSPEND_ABORT_LEN]; -#endif do_gettimeofday(&start); @@ -69,11 +66,6 @@ static int try_to_freeze_tasks(bool user_only) break; if (pm_wakeup_pending()) { -#ifdef CONFIG_PM_SLEEP - pm_get_active_wakeup_sources(suspend_abort, - MAX_SUSPEND_ABORT_LEN); - log_suspend_abort_reason(suspend_abort); -#endif wakeup = true; break; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..e6eceb0aa496 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -43,6 +43,9 @@ #include <linux/kernel.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/cpumask.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -67,6 +70,8 @@ static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_constraints cpu_dma_constraints = { .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .target_per_cpu = { [0 ... 
(NR_CPUS - 1)] = + PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE }, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, @@ -81,6 +86,8 @@ static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_constraints network_lat_constraints = { .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .target_per_cpu = { [0 ... (NR_CPUS - 1)] = + PM_QOS_NETWORK_LAT_DEFAULT_VALUE }, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, @@ -91,11 +98,12 @@ static struct pm_qos_object network_lat_pm_qos = { .name = "network_latency", }; - static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_constraints network_tput_constraints = { .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .target_per_cpu = { [0 ... (NR_CPUS - 1)] = + PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE }, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, @@ -259,22 +267,60 @@ static const struct file_operations pm_qos_debug_fops = { .release = single_release, }; +static inline void pm_qos_set_value_for_cpus(struct pm_qos_constraints *c, + struct cpumask *cpus) +{ + struct pm_qos_request *req = NULL; + int cpu; + s32 qos_val[NR_CPUS] = { [0 ... (NR_CPUS - 1)] = c->default_value }; + + plist_for_each_entry(req, &c->list, node) { + for_each_cpu(cpu, &req->cpus_affine) { + switch (c->type) { + case PM_QOS_MIN: + if (qos_val[cpu] > req->node.prio) + qos_val[cpu] = req->node.prio; + break; + case PM_QOS_MAX: + if (req->node.prio > qos_val[cpu]) + qos_val[cpu] = req->node.prio; + break; + case PM_QOS_SUM: + qos_val[cpu] += req->node.prio; + break; + default: + BUG(); + break; + } + } + } + + for_each_possible_cpu(cpu) { + if (c->target_per_cpu[cpu] != qos_val[cpu]) + cpumask_set_cpu(cpu, cpus); + c->target_per_cpu[cpu] = qos_val[cpu]; + } +} + /** * pm_qos_update_target - manages the constraints list and calls the notifiers * if needed * @c: constraints data struct - * @node: request to add to the list, to update or to remove + * @req: request to add to the list, to update or to remove * @action: action to take on the constraints list * @value: value of the request to add or update * * This function returns 1 if the aggregated constraint value has changed, 0 * otherwise. 
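Editor's note: pm_qos_set_value_for_cpus() above computes a per-CPU target from the set of requests, each of which now carries its own CPU mask. A userspace model of that aggregation for the MIN and MAX constraint types; plain arrays and bit masks stand in for the kernel's plist and cpumask, and aggregate_per_cpu() is an illustrative helper.

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define NCPU 4

enum qos_type { QOS_MIN, QOS_MAX };

struct qos_req {
	int value;
	uint32_t cpus;		/* bit i set == request applies to CPU i */
};

static void aggregate_per_cpu(const struct qos_req *reqs, int nreq,
			      enum qos_type type, int default_value,
			      int target[NCPU])
{
	for (int cpu = 0; cpu < NCPU; cpu++)
		target[cpu] = default_value;

	for (int i = 0; i < nreq; i++) {
		for (int cpu = 0; cpu < NCPU; cpu++) {
			if (!(reqs[i].cpus & (1u << cpu)))
				continue;
			if (type == QOS_MIN && reqs[i].value < target[cpu])
				target[cpu] = reqs[i].value;
			if (type == QOS_MAX && reqs[i].value > target[cpu])
				target[cpu] = reqs[i].value;
		}
	}
}

int main(void)
{
	/* Two latency-style requests: 100us on CPUs 0-1, 50us on CPU 1 only. */
	struct qos_req reqs[] = {
		{ .value = 100, .cpus = 0x3 },
		{ .value = 50,  .cpus = 0x2 },
	};
	int target[NCPU];

	aggregate_per_cpu(reqs, 2, QOS_MIN, INT_MAX, target);
	for (int cpu = 0; cpu < NCPU; cpu++)
		printf("cpu%d target %d\n", cpu, target[cpu]);
	/* cpu0 -> 100, cpu1 -> 50, cpu2/3 -> default */
	return 0;
}

In the patch, only the CPUs whose aggregated value actually changed are set in the mask handed to the notifier chain, which is why the notifier now receives a cpumask instead of NULL.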
*/ -int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, - enum pm_qos_req_action action, int value) +int pm_qos_update_target(struct pm_qos_constraints *c, + struct pm_qos_request *req, + enum pm_qos_req_action action, int value) { unsigned long flags; int prev_value, curr_value, new_value; + struct plist_node *node = &req->node; + struct cpumask cpus; int ret; spin_lock_irqsave(&pm_qos_lock, flags); @@ -305,17 +351,23 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, } curr_value = pm_qos_get_value(c); + cpumask_clear(&cpus); pm_qos_set_value(c, curr_value); + pm_qos_set_value_for_cpus(c, &cpus); spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); - if (prev_value != curr_value) { + /* + * if cpu mask bits are set, call the notifier call chain + * to update the new qos restriction for the cores + */ + if (!cpumask_empty(&cpus)) { ret = 1; if (c->notifiers) blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, - NULL); + &cpus); } else { ret = 0; } @@ -398,12 +450,56 @@ int pm_qos_request(int pm_qos_class) } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_for_cpu(int pm_qos_class, int cpu) +{ + if (cpu_isolated(cpu)) + return INT_MAX; + + return pm_qos_array[pm_qos_class]->constraints->target_per_cpu[cpu]; +} +EXPORT_SYMBOL(pm_qos_request_for_cpu); + int pm_qos_request_active(struct pm_qos_request *req) { return req->pm_qos_class != 0; } EXPORT_SYMBOL_GPL(pm_qos_request_active); +int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) +{ + unsigned long irqflags; + int cpu; + struct pm_qos_constraints *c = NULL; + int val; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + c = pm_qos_array[pm_qos_class]->constraints; + val = c->default_value; + + for_each_cpu(cpu, mask) { + if (cpu_isolated(cpu)) + continue; + + switch (c->type) { + case PM_QOS_MIN: + if (c->target_per_cpu[cpu] < val) + val = c->target_per_cpu[cpu]; + break; + case PM_QOS_MAX: + if (c->target_per_cpu[cpu] > val) + val = c->target_per_cpu[cpu]; + break; + default: + BUG(); + break; + } + } + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + return val; +} +EXPORT_SYMBOL(pm_qos_request_for_cpumask); + static void __pm_qos_update_request(struct pm_qos_request *req, s32 new_value) { @@ -412,7 +508,7 @@ static void __pm_qos_update_request(struct pm_qos_request *req, if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + req, PM_QOS_UPDATE_REQ, new_value); } /** @@ -430,6 +526,41 @@ static void pm_qos_work_fn(struct work_struct *work) __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); } +#ifdef CONFIG_SMP +static void pm_qos_irq_release(struct kref *ref) +{ + unsigned long flags; + struct irq_affinity_notify *notify = container_of(ref, + struct irq_affinity_notify, kref); + struct pm_qos_request *req = container_of(notify, + struct pm_qos_request, irq_notify); + struct pm_qos_constraints *c = + pm_qos_array[req->pm_qos_class]->constraints; + + spin_lock_irqsave(&pm_qos_lock, flags); + cpumask_setall(&req->cpus_affine); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, c->default_value); +} + +static void pm_qos_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + unsigned long flags; + struct pm_qos_request *req = container_of(notify, + struct pm_qos_request, irq_notify); + struct pm_qos_constraints *c = + 
pm_qos_array[req->pm_qos_class]->constraints; + + spin_lock_irqsave(&pm_qos_lock, flags); + cpumask_copy(&req->cpus_affine, mask); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, req->node.prio); +} +#endif + /** * pm_qos_add_request - inserts new qos request into the list * @req: pointer to a preallocated handle @@ -453,11 +584,70 @@ void pm_qos_add_request(struct pm_qos_request *req, WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); return; } + + switch (req->type) { + case PM_QOS_REQ_AFFINE_CORES: + if (cpumask_empty(&req->cpus_affine)) { + req->type = PM_QOS_REQ_ALL_CORES; + cpumask_setall(&req->cpus_affine); + WARN(1, KERN_ERR "Affine cores not set for request with affinity flag\n"); + } + break; +#ifdef CONFIG_SMP + case PM_QOS_REQ_AFFINE_IRQ: + if (irq_can_set_affinity(req->irq)) { + struct irq_desc *desc = irq_to_desc(req->irq); + struct cpumask *mask; + + if (!desc) + return; + mask = desc->irq_data.common->affinity; + + /* Get the current affinity */ + cpumask_copy(&req->cpus_affine, mask); + req->irq_notify.irq = req->irq; + req->irq_notify.notify = pm_qos_irq_notify; + req->irq_notify.release = pm_qos_irq_release; + + } else { + req->type = PM_QOS_REQ_ALL_CORES; + cpumask_setall(&req->cpus_affine); + WARN(1, KERN_ERR "IRQ-%d not set for request with affinity flag\n", + req->irq); + } + break; +#endif + default: + WARN(1, KERN_ERR "Unknown request type %d\n", req->type); + /* fall through */ + case PM_QOS_REQ_ALL_CORES: + cpumask_setall(&req->cpus_affine); + break; + } + req->pm_qos_class = pm_qos_class; INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); trace_pm_qos_add_request(pm_qos_class, value); pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); + req, PM_QOS_ADD_REQ, value); + +#ifdef CONFIG_SMP + if (req->type == PM_QOS_REQ_AFFINE_IRQ && + irq_can_set_affinity(req->irq)) { + int ret = 0; + + ret = irq_set_affinity_notifier(req->irq, + &req->irq_notify); + if (ret) { + WARN(1, "IRQ affinity notify set failed\n"); + req->type = PM_QOS_REQ_ALL_CORES; + cpumask_setall(&req->cpus_affine); + pm_qos_update_target( + pm_qos_array[pm_qos_class]->constraints, + req, PM_QOS_UPDATE_REQ, value); + } + } +#endif } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -511,7 +701,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + req, PM_QOS_UPDATE_REQ, new_value); schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); } @@ -531,15 +721,25 @@ void pm_qos_remove_request(struct pm_qos_request *req) /* silent return to keep pcm code cleaner */ if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + WARN(1, "pm_qos_remove_request() called for unknown object\n"); return; } cancel_delayed_work_sync(&req->work); +#ifdef CONFIG_SMP + if (req->type == PM_QOS_REQ_AFFINE_IRQ) { + int ret = 0; + /* Get the current affinity */ + ret = irq_set_affinity_notifier(req->irq, NULL); + if (ret) + WARN(1, "IRQ affinity notify set failed\n"); + } +#endif + trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, + req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } diff --git a/kernel/power/suspend.c 
b/kernel/power/suspend.c index 58209d8bfc56..6e7832ee6d74 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -287,6 +287,7 @@ static int suspend_prepare(suspend_state_t state) if (!error) return 0; + log_suspend_abort_reason("One or more tasks refusing to freeze"); suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: @@ -316,7 +317,6 @@ void __weak arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state, bool *wakeup) { - char suspend_abort[MAX_SUSPEND_ABORT_LEN]; int error, last_dev; error = platform_suspend_prepare(state); @@ -385,11 +385,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) state, false); events_check_enabled = false; } else if (*wakeup) { - pm_get_active_wakeup_sources(suspend_abort, - MAX_SUSPEND_ABORT_LEN); - log_suspend_abort_reason(suspend_abort); error = -EBUSY; } + + start_logging_wakeup_reasons(); syscore_resume(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a7630e7b22a5..77fef12bb45b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -36,6 +36,15 @@ #define HIBERNATE_SIG "S1SUSPEND" +static int goldenimage; +/* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + /* * When reading an {un,}compressed image, we may restore pages in place, * in which case some architectures need these pages cleaning before they @@ -1532,7 +1541,13 @@ int swsusp_check(void) goto put; if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { - memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); + if (!goldenimage) { + pr_debug("PM: corrupt hibernate image header\n"); + memcpy(swsusp_header->sig, + swsusp_header->orig_sig, 10); + } else { + pr_debug("PM: Header corruption avoided\n"); + } /* Reset swap signature now */ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); @@ -1606,3 +1621,10 @@ static int swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init golden_image_setup(char *str) +{ + goldenimage = 1; + return 1; +} +__setup("golden_image", golden_image_setup); diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c index 252611fad2fe..44d8da2952c5 100644 --- a/kernel/power/wakeup_reason.c +++ b/kernel/power/wakeup_reason.c @@ -26,42 +26,232 @@ #include <linux/spinlock.h> #include <linux/notifier.h> #include <linux/suspend.h> - +#include <linux/slab.h> #define MAX_WAKEUP_REASON_IRQS 32 -static int irq_list[MAX_WAKEUP_REASON_IRQS]; -static int irqcount; static bool suspend_abort; static char abort_reason[MAX_SUSPEND_ABORT_LEN]; + +static struct wakeup_irq_node *base_irq_nodes; +static struct wakeup_irq_node *cur_irq_tree; +static int cur_irq_tree_depth; +static LIST_HEAD(wakeup_irqs); + +static struct kmem_cache *wakeup_irq_nodes_cache; static struct kobject *wakeup_reason; -static DEFINE_SPINLOCK(resume_reason_lock); +static spinlock_t resume_reason_lock; +bool log_wakeups __read_mostly; +struct completion wakeups_completion; static ktime_t last_monotime; /* monotonic time before last suspend */ static ktime_t curr_monotime; /* monotonic time after last suspend */ static ktime_t last_stime; /* monotonic boottime offset before last suspend */ static ktime_t curr_stime; /* monotonic boottime offset after last suspend */ -static ssize_t 
last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) +static void init_wakeup_irq_node(struct wakeup_irq_node *p, int irq) { - int irq_no, buf_offset = 0; - struct irq_desc *desc; - spin_lock(&resume_reason_lock); - if (suspend_abort) { - buf_offset = sprintf(buf, "Abort: %s", abort_reason); - } else { - for (irq_no = 0; irq_no < irqcount; irq_no++) { - desc = irq_to_desc(irq_list[irq_no]); - if (desc && desc->action && desc->action->name) - buf_offset += sprintf(buf + buf_offset, "%d %s\n", - irq_list[irq_no], desc->action->name); - else - buf_offset += sprintf(buf + buf_offset, "%d\n", - irq_list[irq_no]); + p->irq = irq; + p->desc = irq_to_desc(irq); + p->child = NULL; + p->parent = NULL; + p->handled = false; + INIT_LIST_HEAD(&p->siblings); + INIT_LIST_HEAD(&p->next); +} + +static struct wakeup_irq_node* alloc_irq_node(int irq) +{ + struct wakeup_irq_node *n; + + n = kmem_cache_alloc(wakeup_irq_nodes_cache, GFP_ATOMIC); + if (!n) { + pr_warning("Failed to log chained wakeup IRQ %d\n", + irq); + return NULL; + } + + init_wakeup_irq_node(n, irq); + return n; +} + +static struct wakeup_irq_node * +search_siblings(struct wakeup_irq_node *root, int irq) +{ + bool found = false; + struct wakeup_irq_node *n = NULL; + BUG_ON(!root); + + if (root->irq == irq) + return root; + + list_for_each_entry(n, &root->siblings, siblings) { + if (n->irq == irq) { + found = true; + break; } } - spin_unlock(&resume_reason_lock); - return buf_offset; + + return found ? n : NULL; +} + +static struct wakeup_irq_node * +add_to_siblings(struct wakeup_irq_node *root, int irq) +{ + struct wakeup_irq_node *n; + if (root) { + n = search_siblings(root, irq); + if (n) + return n; + } + n = alloc_irq_node(irq); + + if (n && root) + list_add(&n->siblings, &root->siblings); + return n; +} + +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS +static struct wakeup_irq_node* add_child(struct wakeup_irq_node *root, int irq) +{ + if (!root->child) { + root->child = alloc_irq_node(irq); + if (!root->child) + return NULL; + root->child->parent = root; + return root->child; + } + + return add_to_siblings(root->child, irq); +} + +static struct wakeup_irq_node *find_first_sibling(struct wakeup_irq_node *node) +{ + struct wakeup_irq_node *n; + if (node->parent) + return node; + list_for_each_entry(n, &node->siblings, siblings) { + if (n->parent) + return n; + } + return NULL; +} + +static struct wakeup_irq_node * +get_base_node(struct wakeup_irq_node *node, unsigned depth) +{ + if (!node) + return NULL; + + while (depth) { + node = find_first_sibling(node); + BUG_ON(!node); + node = node->parent; + depth--; + } + + return node; +} +#endif /* CONFIG_DEDUCE_WAKEUP_REASONS */ + +static const struct list_head* get_wakeup_reasons_nosync(void); + +static void print_wakeup_sources(void) +{ + struct wakeup_irq_node *n; + const struct list_head *wakeups; + + if (suspend_abort) { + pr_info("Abort: %s\n", abort_reason); + return; + } + + wakeups = get_wakeup_reasons_nosync(); + list_for_each_entry(n, wakeups, next) { + if (n->desc && n->desc->action && n->desc->action->name) + pr_info("Resume caused by IRQ %d, %s\n", n->irq, + n->desc->action->name); + else + pr_info("Resume caused by IRQ %d\n", n->irq); + } +} + +static bool walk_irq_node_tree(struct wakeup_irq_node *root, + bool (*visit)(struct wakeup_irq_node *, void *), + void *cookie) +{ + struct wakeup_irq_node *n, *t; + + if (!root) + return true; + + list_for_each_entry_safe(n, t, &root->siblings, siblings) { + if (!walk_irq_node_tree(n->child, visit, cookie)) + 
return false; + if (!visit(n, cookie)) + return false; + } + + if (!walk_irq_node_tree(root->child, visit, cookie)) + return false; + return visit(root, cookie); +} + +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS +static bool is_node_handled(struct wakeup_irq_node *n, void *_p) +{ + return n->handled; +} + +static bool base_irq_nodes_done(void) +{ + return walk_irq_node_tree(base_irq_nodes, is_node_handled, NULL); +} +#endif + +struct buf_cookie { + char *buf; + int buf_offset; +}; + +static bool print_leaf_node(struct wakeup_irq_node *n, void *_p) +{ + struct buf_cookie *b = _p; + if (!n->child) { + if (n->desc && n->desc->action && n->desc->action->name) + b->buf_offset += + snprintf(b->buf + b->buf_offset, + PAGE_SIZE - b->buf_offset, + "%d %s\n", + n->irq, n->desc->action->name); + else + b->buf_offset += + snprintf(b->buf + b->buf_offset, + PAGE_SIZE - b->buf_offset, + "%d\n", + n->irq); + } + return true; +} + +static ssize_t last_resume_reason_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long flags; + + struct buf_cookie b = { + .buf = buf, + .buf_offset = 0 + }; + + spin_lock_irqsave(&resume_reason_lock, flags); + if (suspend_abort) + b.buf_offset = snprintf(buf, PAGE_SIZE, "Abort: %s", abort_reason); + else + walk_irq_node_tree(base_irq_nodes, print_leaf_node, &b); + spin_unlock_irqrestore(&resume_reason_lock, flags); + + return b.buf_offset; } static ssize_t last_suspend_time_show(struct kobject *kobj, @@ -104,56 +294,155 @@ static struct attribute_group attr_group = { .attrs = attrs, }; +static inline void stop_logging_wakeup_reasons(void) +{ + ACCESS_ONCE(log_wakeups) = false; + smp_wmb(); +} + /* - * logs all the wake up reasons to the kernel - * stores the irqs to expose them to the userspace via sysfs + * stores the immediate wakeup irqs; these often aren't the ones seen by + * the drivers that registered them, due to chained interrupt controllers, + * and multiple-interrupt dispatch. */ -void log_wakeup_reason(int irq) +void log_base_wakeup_reason(int irq) { - struct irq_desc *desc; - desc = irq_to_desc(irq); - if (desc && desc->action && desc->action->name) - printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq, - desc->action->name); - else - printk(KERN_INFO "Resume caused by IRQ %d\n", irq); + /* No locking is needed, since this function is called within + * syscore_resume, with both nonboot CPUs and interrupts disabled. + */ + base_irq_nodes = add_to_siblings(base_irq_nodes, irq); + BUG_ON(!base_irq_nodes); +#ifndef CONFIG_DEDUCE_WAKEUP_REASONS + base_irq_nodes->handled = true; +#endif +} - spin_lock(&resume_reason_lock); - if (irqcount == MAX_WAKEUP_REASON_IRQS) { - spin_unlock(&resume_reason_lock); - printk(KERN_WARNING "Resume caused by more than %d IRQs\n", - MAX_WAKEUP_REASON_IRQS); - return; +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS + +/* This function is called by generic_handle_irq, which may call itself + * recursively. This happens with interrupts disabled. Using + * log_possible_wakeup_reason, we build a tree of interrupts, tracing the call + * stack of generic_handle_irq, for each wakeup source containing the + * interrupts actually handled. + * + * Most of these "trees" would either have a single node (in the event that the + * wakeup source is the final interrupt), or consist of a list of two + * interrupts, with the wakeup source at the root, and the final dispatched + * interrupt at the leaf. 
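Editor's note: walk_irq_node_tree() above visits every node of the chained-interrupt tree and aborts the whole walk as soon as the visitor returns false. A compact userspace model of that traversal; plain child/sibling pointers stand in for the kernel's list_head siblings, and the node and function names are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct irq_node {
	int irq;
	bool handled;
	struct irq_node *child;		/* interrupt dispatched from this one */
	struct irq_node *sibling;	/* other interrupts at the same depth */
};

static bool walk(struct irq_node *n,
		 bool (*visit)(struct irq_node *, void *), void *cookie)
{
	for (; n; n = n->sibling) {
		if (!walk(n->child, visit, cookie))
			return false;
		if (!visit(n, cookie))	/* visitor can abort the whole walk */
			return false;
	}
	return true;
}

static bool print_leaf(struct irq_node *n, void *cookie)
{
	(void)cookie;
	if (!n->child)	/* leaves are the handlers that actually ran */
		printf("wakeup leaf irq %d\n", n->irq);
	return true;	/* keep walking */
}

int main(void)
{
	/* irq 10 (chained controller) dispatched irq 42; irq 7 stands alone. */
	struct irq_node leaf  = { .irq = 42 };
	struct irq_node base2 = { .irq = 7 };
	struct irq_node base1 = { .irq = 10, .child = &leaf, .sibling = &base2 };

	walk(&base1, print_leaf, NULL);
	return 0;
}

The same walk drives the other visitors in this file: printing leaf nodes for sysfs, checking whether a given irq caused the wakeup (a visitor that returns false on a match, so an aborted walk means "found"), and freeing every node when the reasons are cleared.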
+ * + * When *all* wakeup sources have been thusly spoken for, this function will + * clear the log_wakeups flag, and print the wakeup reasons. + + TODO: percpu + + */ + +static struct wakeup_irq_node * +log_possible_wakeup_reason_start(int irq, struct irq_desc *desc, unsigned depth) +{ + BUG_ON(!irqs_disabled()); + BUG_ON((signed)depth < 0); + + /* This function can race with a call to stop_logging_wakeup_reasons() + * from a thread context. If this happens, just exit silently, as we are no + * longer interested in logging interrupts. + */ + if (!logging_wakeup_reasons()) + return NULL; + + /* If suspend was aborted, the base IRQ nodes are missing, and we stop + * logging interrupts immediately. + */ + if (!base_irq_nodes) { + stop_logging_wakeup_reasons(); + return NULL; + } + + /* We assume wakeup interrupts are handlerd only by the first core. */ + /* TODO: relax this by having percpu versions of the irq tree */ + if (smp_processor_id() != 0) { + return NULL; } - irq_list[irqcount++] = irq; - spin_unlock(&resume_reason_lock); + if (depth == 0) { + cur_irq_tree_depth = 0; + cur_irq_tree = search_siblings(base_irq_nodes, irq); + } + else if (cur_irq_tree) { + if (depth > cur_irq_tree_depth) { + BUG_ON(depth - cur_irq_tree_depth > 1); + cur_irq_tree = add_child(cur_irq_tree, irq); + if (cur_irq_tree) + cur_irq_tree_depth++; + } + else { + cur_irq_tree = get_base_node(cur_irq_tree, + cur_irq_tree_depth - depth); + cur_irq_tree_depth = depth; + cur_irq_tree = add_to_siblings(cur_irq_tree, irq); + } + } + + return cur_irq_tree; } -int check_wakeup_reason(int irq) +static void log_possible_wakeup_reason_complete(struct wakeup_irq_node *n, + unsigned depth, + bool handled) { - int irq_no; - int ret = false; - - spin_lock(&resume_reason_lock); - for (irq_no = 0; irq_no < irqcount; irq_no++) - if (irq_list[irq_no] == irq) { - ret = true; - break; + if (!n) + return; + n->handled = handled; + if (depth == 0) { + if (base_irq_nodes_done()) { + stop_logging_wakeup_reasons(); + complete(&wakeups_completion); + print_wakeup_sources(); + } } - spin_unlock(&resume_reason_lock); - return ret; } +bool log_possible_wakeup_reason(int irq, + struct irq_desc *desc, + bool (*handler)(struct irq_desc *)) +{ + static DEFINE_PER_CPU(unsigned int, depth); + + struct wakeup_irq_node *n; + bool handled; + unsigned d; + + d = get_cpu_var(depth)++; + put_cpu_var(depth); + + n = log_possible_wakeup_reason_start(irq, desc, d); + + handled = handler(desc); + + d = --get_cpu_var(depth); + put_cpu_var(depth); + + if (!handled && desc && desc->action) + pr_debug("%s: irq %d action %pF not handled\n", __func__, + irq, desc->action->handler); + + log_possible_wakeup_reason_complete(n, d, handled); + + return handled; +} + +#endif /* CONFIG_DEDUCE_WAKEUP_REASONS */ + void log_suspend_abort_reason(const char *fmt, ...) { va_list args; + unsigned long flags; - spin_lock(&resume_reason_lock); + spin_lock_irqsave(&resume_reason_lock, flags); //Suspend abort reason has already been logged. if (suspend_abort) { - spin_unlock(&resume_reason_lock); + spin_unlock_irqrestore(&resume_reason_lock, flags); return; } @@ -161,29 +450,128 @@ void log_suspend_abort_reason(const char *fmt, ...) 
va_start(args, fmt); vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args); va_end(args); - spin_unlock(&resume_reason_lock); + + spin_unlock_irqrestore(&resume_reason_lock, flags); +} + +static bool match_node(struct wakeup_irq_node *n, void *_p) +{ + int irq = *((int *)_p); + return n->irq != irq; +} + +int check_wakeup_reason(int irq) +{ + bool found; + unsigned long flags; + spin_lock_irqsave(&resume_reason_lock, flags); + found = !walk_irq_node_tree(base_irq_nodes, match_node, &irq); + spin_unlock_irqrestore(&resume_reason_lock, flags); + return found; +} + +static bool build_leaf_nodes(struct wakeup_irq_node *n, void *_p) +{ + struct list_head *wakeups = _p; + if (!n->child) + list_add(&n->next, wakeups); + return true; +} + +static const struct list_head* get_wakeup_reasons_nosync(void) +{ + BUG_ON(logging_wakeup_reasons()); + INIT_LIST_HEAD(&wakeup_irqs); + walk_irq_node_tree(base_irq_nodes, build_leaf_nodes, &wakeup_irqs); + return &wakeup_irqs; +} + +static bool build_unfinished_nodes(struct wakeup_irq_node *n, void *_p) +{ + struct list_head *unfinished = _p; + if (!n->handled) { + pr_warning("%s: wakeup irq %d was not handled\n", + __func__, n->irq); + list_add(&n->next, unfinished); + } + return true; +} + +const struct list_head* get_wakeup_reasons(unsigned long timeout, + struct list_head *unfinished) +{ + INIT_LIST_HEAD(unfinished); + + if (logging_wakeup_reasons()) { + unsigned long signalled = 0; + if (timeout) + signalled = wait_for_completion_timeout(&wakeups_completion, timeout); + if (WARN_ON(!signalled)) { + stop_logging_wakeup_reasons(); + walk_irq_node_tree(base_irq_nodes, build_unfinished_nodes, unfinished); + return NULL; + } + pr_info("%s: waited for %u ms\n", + __func__, + jiffies_to_msecs(timeout - signalled)); + } + + return get_wakeup_reasons_nosync(); +} + +static bool delete_node(struct wakeup_irq_node *n, void *unused) +{ + list_del(&n->siblings); + kmem_cache_free(wakeup_irq_nodes_cache, n); + return true; +} + +void clear_wakeup_reasons(void) +{ + unsigned long flags; + spin_lock_irqsave(&resume_reason_lock, flags); + + BUG_ON(logging_wakeup_reasons()); + walk_irq_node_tree(base_irq_nodes, delete_node, NULL); + base_irq_nodes = NULL; + cur_irq_tree = NULL; + cur_irq_tree_depth = 0; + INIT_LIST_HEAD(&wakeup_irqs); + suspend_abort = false; + + spin_unlock_irqrestore(&resume_reason_lock, flags); } /* Detects a suspend and clears all the previous wake up reasons*/ static int wakeup_reason_pm_event(struct notifier_block *notifier, unsigned long pm_event, void *unused) { + unsigned long flags; switch (pm_event) { case PM_SUSPEND_PREPARE: - spin_lock(&resume_reason_lock); - irqcount = 0; + spin_lock_irqsave(&resume_reason_lock, flags); suspend_abort = false; - spin_unlock(&resume_reason_lock); + spin_unlock_irqrestore(&resume_reason_lock, flags); /* monotonic time since boot */ last_monotime = ktime_get(); /* monotonic time since boot including the time spent in suspend */ last_stime = ktime_get_boottime(); + clear_wakeup_reasons(); break; case PM_POST_SUSPEND: /* monotonic time since boot */ curr_monotime = ktime_get(); /* monotonic time since boot including the time spent in suspend */ curr_stime = ktime_get_boottime(); +#ifdef CONFIG_DEDUCE_WAKEUP_REASONS + /* log_wakeups should have been cleared by now. 
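Editor's note: get_wakeup_reasons() above waits on a completion for a bounded time and, on timeout, stops logging and reports whatever was collected. A rough userspace analog of that bounded wait, using a POSIX semaphore in place of the kernel completion; wait_for_wakeups() and wakeups_done are illustrative names.

#include <stdio.h>
#include <time.h>
#include <errno.h>
#include <semaphore.h>

static sem_t wakeups_done;	/* stand-in for wakeups_completion */

static int wait_for_wakeups(unsigned int timeout_ms)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += timeout_ms / 1000;
	ts.tv_nsec += (long)(timeout_ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}

	if (sem_timedwait(&wakeups_done, &ts) == 0)
		return 0;		/* completion signalled in time */
	if (errno == ETIMEDOUT) {
		fprintf(stderr, "wakeup logging did not finish in time\n");
		return -ETIMEDOUT;	/* caller collects unfinished nodes */
	}
	return -errno;
}

int main(void)
{
	sem_init(&wakeups_done, 0, 0);
	sem_post(&wakeups_done);	/* pretend the last wakeup IRQ completed */
	return wait_for_wakeups(100) ? 1 : 0;
}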
*/ + if (WARN_ON(logging_wakeup_reasons())) { + stop_logging_wakeup_reasons(); + print_wakeup_sources(); + } +#else + print_wakeup_sources(); +#endif break; default: break; @@ -195,31 +583,46 @@ static struct notifier_block wakeup_reason_pm_notifier_block = { .notifier_call = wakeup_reason_pm_event, }; -/* Initializes the sysfs parameter - * registers the pm_event notifier - */ int __init wakeup_reason_init(void) { - int retval; + spin_lock_init(&resume_reason_lock); - retval = register_pm_notifier(&wakeup_reason_pm_notifier_block); - if (retval) - printk(KERN_WARNING "[%s] failed to register PM notifier %d\n", - __func__, retval); + if (register_pm_notifier(&wakeup_reason_pm_notifier_block)) { + pr_warning("[%s] failed to register PM notifier\n", + __func__); + goto fail; + } wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj); if (!wakeup_reason) { - printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n", + pr_warning("[%s] failed to create a sysfs kobject\n", __func__); - return 1; + goto fail_unregister_pm_notifier; } - retval = sysfs_create_group(wakeup_reason, &attr_group); - if (retval) { - kobject_put(wakeup_reason); - printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n", - __func__, retval); + + if (sysfs_create_group(wakeup_reason, &attr_group)) { + pr_warning("[%s] failed to create a sysfs group\n", + __func__); + goto fail_kobject_put; } + + wakeup_irq_nodes_cache = + kmem_cache_create("wakeup_irq_node_cache", + sizeof(struct wakeup_irq_node), 0, + 0, NULL); + if (!wakeup_irq_nodes_cache) + goto fail_remove_group; + return 0; + +fail_remove_group: + sysfs_remove_group(wakeup_reason, &attr_group); +fail_kobject_put: + kobject_put(wakeup_reason); +fail_unregister_pm_notifier: + unregister_pm_notifier(&wakeup_reason_pm_notifier_block); +fail: + return 1; } late_initcall(wakeup_reason_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8e229b85f50e..6613e3437b91 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -236,7 +236,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. 
This can be taken @@ -277,11 +281,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -497,7 +497,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, int source) +static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -525,7 +525,6 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } -EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { @@ -2140,6 +2139,8 @@ void resume_console(void) console_unlock(); } +#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG + /** * console_cpu_notify - print deferred console messages after CPU hotplug * @self: notifier struct @@ -2159,12 +2160,18 @@ static int console_cpu_notify(struct notifier_block *self, case CPU_DEAD: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: + case CPU_DYING: +#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG console_lock(); console_unlock(); +#endif + break; } return NOTIFY_OK; } +#endif + /** * console_lock - lock the console system for exclusive use. * @@ -2729,7 +2736,9 @@ static int __init printk_late_init(void) unregister_console(con); } } +#ifdef CONFIG_CONSOLE_FLUSH_ON_HOTPLUG hotcpu_notifier(console_cpu_notify, 0); +#endif return 0; } late_initcall(printk_late_init); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 082aedefe29c..61d1a83aabf5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include <linux/trace_events.h> #include <linux/suspend.h> +#include <soc/qcom/watchdog.h> + #include "tree.h" #include "rcu.h" @@ -246,24 +248,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -300,17 +295,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. 
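Editor's note: the printk_log / LOG_ALIGN change earlier in this section packs the record header to 4-byte alignment on architectures with efficient unaligned access, instead of special-casing LOG_ALIGN. A small demo of how GCC-style packed/aligned attributes change a record header's alignment; the struct below only mirrors the field types, not the real printk record layout.

#include <stdio.h>
#include <stdint.h>
#include <stdalign.h>

struct rec_natural {
	uint64_t ts_nsec;
	uint16_t len;
	uint16_t text_len;
	uint16_t dict_len;
	uint8_t  facility;
	uint8_t  flags;
};

struct rec_packed {
	uint64_t ts_nsec;
	uint16_t len;
	uint16_t text_len;
	uint16_t dict_len;
	uint8_t  facility;
	uint8_t  flags;
} __attribute__((packed, aligned(4)));	/* GCC/Clang attribute syntax */

int main(void)
{
	printf("natural: size %zu align %zu\n",
	       sizeof(struct rec_natural), alignof(struct rec_natural));
	printf("packed4: size %zu align %zu\n",
	       sizeof(struct rec_packed), alignof(struct rec_packed));
	return 0;
}

With the 4-byte alignment the records can be laid out back to back in the log buffer without relying on the struct's natural (8-byte) alignment, which is what the single LOG_ALIGN definition now expresses.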
*/ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -340,13 +334,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -376,9 +369,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -1310,6 +1308,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); +#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE + /* Induce watchdog bite */ + msm_trigger_wdog_bite(); +#endif + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1345,6 +1348,11 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore(&rnp->lock, flags); +#ifdef CONFIG_RCU_STALL_WATCHDOG_BITE + /* Induce non secure watchdog bite to collect context */ + msm_trigger_wdog_bite(); +#endif + /* * Attempt to revive the RCU machinery by forcing a context switch. * diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 32cbe72bf545..c6fc11d626f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -147,8 +147,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +236,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +251,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +285,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. 
*/ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,7 +299,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock(&rnp->lock); smp_mb__after_unlock_lock(); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +316,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { diff --git a/kernel/resource.c b/kernel/resource.c index 41718cd8cab5..73348f574163 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -172,7 +172,7 @@ static const struct file_operations proc_iomem_operations = { static int __init ioresources_init(void) { proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); + proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations); return 0; } __initcall(ioresources_init); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 99378130a42f..7dde1b9918e4 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,13 +17,14 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o -obj-$(CONFIG_SCHED_WALT) += walt.o +obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/boost.c b/kernel/sched/boost.c new file mode 100644 index 000000000000..5bdd51b1e55e --- /dev/null +++ b/kernel/sched/boost.c @@ -0,0 +1,217 @@ +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "sched.h" +#include <linux/of.h> +#include <linux/sched/core_ctl.h> +#include <trace/events/sched.h> + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. 
+ */ + +unsigned int sysctl_sched_boost; +static enum sched_boost_policy boost_policy; +static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE; +static DEFINE_MUTEX(boost_mutex); +static unsigned int freq_aggr_threshold_backup; + +static inline void boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) + smp_send_reschedule(cpu); +} + +static void boost_kick_cpus(void) +{ + int i; + struct cpumask kick_mask; + + if (boost_policy != SCHED_BOOST_ON_BIG) + return; + + cpumask_andnot(&kick_mask, cpu_online_mask, cpu_isolated_mask); + + for_each_cpu(i, &kick_mask) { + if (cpu_capacity(i) != max_capacity) + boost_kick(i); + } +} + +int got_boost_kick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + return test_bit(BOOST_KICK, &rq->hmp_flags); +} + +void clear_boost_kick(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(BOOST_KICK, &rq->hmp_flags); +} + +/* + * Scheduler boost type and boost policy might at first seem unrelated, + * however, there exists a connection between them that will allow us + * to use them interchangeably during placement decisions. We'll explain + * the connection here in one possible way so that the implications are + * clear when looking at placement policies. + * + * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED + * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can + * neither be none nor RESTRAINED. + */ +static void set_boost_policy(int type) +{ + if (type == SCHED_BOOST_NONE || type == RESTRAINED_BOOST) { + boost_policy = SCHED_BOOST_NONE; + return; + } + + if (boost_policy_dt) { + boost_policy = boost_policy_dt; + return; + } + + if (min_possible_efficiency != max_possible_efficiency) { + boost_policy = SCHED_BOOST_ON_BIG; + return; + } + + boost_policy = SCHED_BOOST_ON_ALL; +} + +enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +static bool verify_boost_params(int old_val, int new_val) +{ + /* + * Boost can only be turned on or off. There is no possiblity of + * switching from one boost type to another or to set the same + * kind of boost several times. 
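Editor's note: verify_boost_params() below reduces to "the transition is valid only if it flips between no boost and some boost". A trivial truth-table demo of that rule; boost_transition_valid() is an illustrative name for the same expression.

#include <stdbool.h>
#include <stdio.h>

static bool boost_transition_valid(int old_val, int new_val)
{
	/* equivalent to !(!!old_val == !!new_val) in the patch */
	return !!old_val != !!new_val;
}

int main(void)
{
	int cases[][2] = {
		{ 0, 1 },	/* off  -> boost type 1: valid   */
		{ 1, 0 },	/* boost -> off:         valid   */
		{ 1, 2 },	/* boost -> other boost: invalid */
		{ 0, 0 },	/* off  -> off:          invalid */
	};

	for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("%d -> %d : %s\n", cases[i][0], cases[i][1],
		       boost_transition_valid(cases[i][0], cases[i][1]) ?
		       "valid" : "invalid");
	return 0;
}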
+ */ + return !(!!old_val == !!new_val); +} + +static void _sched_set_boost(int old_val, int type) +{ + switch (type) { + case NO_BOOST: + if (old_val == FULL_THROTTLE_BOOST) + core_ctl_set_boost(false); + else if (old_val == CONSERVATIVE_BOOST) + restore_cgroup_boost_settings(); + else + update_freq_aggregate_threshold( + freq_aggr_threshold_backup); + break; + + case FULL_THROTTLE_BOOST: + core_ctl_set_boost(true); + boost_kick_cpus(); + break; + + case CONSERVATIVE_BOOST: + update_cgroup_boost_settings(); + boost_kick_cpus(); + break; + + case RESTRAINED_BOOST: + freq_aggr_threshold_backup = + update_freq_aggregate_threshold(1); + break; + + default: + WARN_ON(1); + return; + } + + set_boost_policy(type); + sysctl_sched_boost = type; + trace_sched_set_boost(type); +} + +void sched_boost_parse_dt(void) +{ + struct device_node *sn; + const char *boost_policy; + + sn = of_find_node_by_path("/sched-hmp"); + if (!sn) + return; + + if (!of_property_read_string(sn, "boost-policy", &boost_policy)) { + if (!strcmp(boost_policy, "boost-on-big")) + boost_policy_dt = SCHED_BOOST_ON_BIG; + else if (!strcmp(boost_policy, "boost-on-all")) + boost_policy_dt = SCHED_BOOST_ON_ALL; + } +} + +int sched_set_boost(int type) +{ + int ret = 0; + + mutex_lock(&boost_mutex); + + if (verify_boost_params(sysctl_sched_boost, type)) + _sched_set_boost(sysctl_sched_boost, type); + else + ret = -EINVAL; + + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + mutex_lock(&boost_mutex); + + old_val = *data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (verify_boost_params(old_val, *data)) { + _sched_set_boost(old_val, *data); + } else { + *data = old_val; + ret = -EINVAL; + } + +done: + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost(void) +{ + return sysctl_sched_boost; +} diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5b0a..bc54e84675da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13f2475716c7..633139647129 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -74,6 +75,8 @@ #include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/irq.h> +#include <linux/sched/core_ctl.h> #include <linux/cpufreq_times.h> #include <asm/switch_to.h> @@ -83,14 +86,19 @@ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif +#ifdef CONFIG_MSM_APP_SETTINGS +#include <asm/app_api.h> +#endif #include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" +#include "../time/tick-internal.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#include "walt.h" + +ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -856,6 +864,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_RESTORE)) 
sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); + trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -864,6 +873,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); + trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -879,6 +889,9 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) if (task_contributes_to_load(p)) rq->nr_uninterruptible++; + if (flags & DEQUEUE_SLEEP) + clear_ed_task(p, rq); + dequeue_task(rq, p, flags); } @@ -1094,8 +1107,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); double_unlock_balance(rq, cpu_rq(new_cpu)); @@ -1105,8 +1118,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1128,6 +1141,8 @@ struct migration_arg { */ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) { + int src_cpu; + if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -1135,6 +1150,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return rq; + src_cpu = cpu_of(rq); rq = move_queued_task(rq, p, dest_cpu); return rq; @@ -1150,6 +1166,8 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + int src_cpu = cpu_of(rq); + bool moved = false; /* * The original target cpu might have gone down and we might @@ -1170,12 +1188,18 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ - if (task_rq(p) == rq && task_on_rq_queued(p)) + if (task_rq(p) == rq && task_on_rq_queued(p)) { rq = __migrate_task(rq, p, arg->dest_cpu); + moved = true; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); local_irq_enable(); + + if (moved) + notify_migration(src_cpu, arg->dest_cpu, false, p); + return 0; } @@ -1234,6 +1258,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; unsigned int dest_cpu; int ret = 0; + cpumask_t allowed_mask; rq = task_rq_lock(p, &flags); @@ -1249,18 +1274,25 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + cpumask_and(&allowed_mask, &allowed_mask, cpu_active_mask); + + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + cpumask_and(&allowed_mask, cpu_active_mask, new_mask); + dest_cpu = cpumask_any(&allowed_mask); + if (dest_cpu >= nr_cpu_ids) { + ret = -EINVAL; + goto out; + } } do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? 
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -1299,6 +1331,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1315,7 +1356,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif #endif - trace_sched_migrate_task(p, new_cpu); + trace_sched_migrate_task(p, new_cpu, pct_task_load(p)); if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) @@ -1323,7 +1364,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; perf_event_task_migrate(p); - walt_fixup_busy_time(p, new_cpu); + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1337,11 +1378,13 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, cpu); p->on_rq = TASK_ON_RQ_QUEUED; activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1527,7 +1570,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. 
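
The reworked __set_cpus_allowed_ptr() above first looks for a destination CPU that is in the new mask, active, and not isolated; if that set is empty it falls back to any active CPU in the mask (isolated or not), and only fails with -EINVAL when the mask contains no active CPU at all. A simplified userspace model of that two-step selection, using plain bit masks in place of cpumask_t:

    #include <stdio.h>

    /* Hypothetical 8-CPU masks; bit i set means CPU i is in the mask. */
    static int pick_dest_cpu(unsigned int new_mask, unsigned int active,
                             unsigned int isolated)
    {
        unsigned int allowed = new_mask & active & ~isolated;

        if (!allowed)                   /* fall back to isolated-but-active */
            allowed = new_mask & active;
        if (!allowed)
            return -1;                  /* kernel returns -EINVAL here */

        return __builtin_ctz(allowed);  /* "cpumask_any": lowest set bit */
    }

    int main(void)
    {
        /* CPUs 0-3 active, CPU 2 isolated, task allowed on {2,3}. */
        printf("%d\n", pick_dest_cpu(0x0c, 0x0f, 0x04));   /* 3  */
        /* Task allowed only on the isolated CPU 2. */
        printf("%d\n", pick_dest_cpu(0x04, 0x0f, 0x04));   /* 2  */
        /* Task allowed only on inactive CPU 4. */
        printf("%d\n", pick_dest_cpu(0x10, 0x0f, 0x04));   /* -1 */
        return 0;
    }
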
*/ if (unlikely(queued)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + ktime_t to = ktime_set(0, NSEC_PER_MSEC); set_current_state(TASK_UNINTERRUPTIBLE); schedule_hrtimeout(&to, HRTIMER_MODE_REL); @@ -1573,12 +1616,13 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) { int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; + enum { cpuset, possible, fail, bug } state = cpuset; int dest_cpu; + int isolated_candidate = -1; /* * If the node that the cpu is on has been offlined, cpu_to_node() @@ -1594,6 +1638,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } @@ -1606,6 +1652,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p) continue; if (!cpu_active(dest_cpu)) continue; + if (cpu_isolated(dest_cpu)) { + if (allow_iso) + isolated_candidate = dest_cpu; + continue; + } + goto out; + } + + if (isolated_candidate != -1) { + dest_cpu = isolated_candidate; goto out; } @@ -1624,6 +1680,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) break; case fail: + allow_iso = true; + state = bug; + break; + + case bug: BUG(); break; } @@ -1652,6 +1713,8 @@ static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, int sibling_count_hint) { + bool allow_isolated = (p->flags & PF_KTHREAD); + lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) @@ -1669,13 +1732,14 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + !cpu_online(cpu)) || + (cpu_isolated(cpu) && !allow_isolated)) + cpu = select_fallback_rq(task_cpu(p), p, allow_isolated); return cpu; } -static void update_avg(u64 *avg, u64 sample) +void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; *avg += diff >> 3; @@ -1748,6 +1812,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -1839,6 +1904,8 @@ void sched_ttwu_pending(void) void scheduler_ipi(void) { + int cpu = smp_processor_id(); + /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send @@ -1846,9 +1913,18 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() && + !got_boost_kick()) return; + if (got_boost_kick()) { + struct rq *rq = cpu_rq(cpu); + + if (rq->curr->sched_class == &fair_sched_class) + check_for_migration(rq, rq->curr); + clear_boost_kick(cpu); + } + /* * Not all reschedule IPI handlers call irq_enter/irq_exit, since * traditionally all their work was done from the interrupt return @@ -1868,7 +1944,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. 
*/ - if (unlikely(got_nohz_idle_kick())) { + if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } @@ -1961,11 +2037,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, int sibling_count_hint) { unsigned long flags; - int cpu, success = 0; + int cpu, src_cpu, success = 0; #ifdef CONFIG_SMP + unsigned int old_load; struct rq *rq; u64 wallclock; + struct related_thread_group *grp = NULL; #endif + bool freq_notif_allowed = !(wake_flags & WF_NO_NOTIFIER); + bool check_group = false; + + wake_flags &= ~WF_NO_NOTIFIER; /* * If we are going to wake up a thread waiting for CONDITION we @@ -1975,13 +2057,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, */ smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + src_cpu = cpu = task_cpu(p); + if (!(p->state & state)) goto out; trace_sched_waking(p); success = 1; /* we're going to change ->state */ - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2048,11 +2131,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, rq = cpu_rq(task_cpu(p)); raw_spin_lock(&rq->lock); - wallclock = walt_ktime_clock(); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + old_load = task_load(p); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); raw_spin_unlock(&rq->lock); + rcu_read_lock(); + grp = task_related_thread_group(p); + if (update_preferred_cluster(grp, p, old_load)) + set_preferred_cluster(grp); + rcu_read_unlock(); + check_group = grp != NULL; + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2061,19 +2153,33 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, sibling_count_hint); - if (task_cpu(p) != cpu) { + + /* Refresh src_cpu as it could have changed since we last read it */ + src_cpu = task_cpu(p); + if (src_cpu != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + note_task_waking(p, wallclock); #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); stat: ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); + if (freq_notif_allowed) { + if (!same_freq_domain(src_cpu, cpu)) { + check_for_freq_change(cpu_rq(cpu), + false, check_group); + check_for_freq_change(cpu_rq(src_cpu), + false, check_group); + } else if (success) { + check_for_freq_change(cpu_rq(cpu), true, false); + } + } + return success; } @@ -2089,9 +2195,13 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) + if (rq != this_rq() || p == current) { + printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p," + " this_rq = %p, p = %p, current = %p\n", + __func__, task_pid_nr(p), p->comm, rq, + this_rq(), p, current); return; + } lockdep_assert_held(&rq->lock); @@ -2115,17 +2225,20 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); if (!task_on_rq_queued(p)) { - u64 wallclock = walt_ktime_clock(); + u64 wallclock = sched_ktime_clock(); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); - walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + update_task_ravg(rq->curr, rq, 
TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + note_task_waking(p, wallclock); } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); + /* Todo : Send cpufreq notifier */ } /** @@ -2146,6 +2259,26 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_process_no_notif - Wake up a specific process without notifying + * governor + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process_no_notif(struct task_struct *p) +{ + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1); +} +EXPORT_SYMBOL(wake_up_process_no_notif); + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0, 1); @@ -2170,6 +2303,44 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; } +#ifdef CONFIG_SCHED_HMP +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 wallclock; + + sched_set_group_id(p, 0); + + rq = task_rq_lock(p, &flags); + + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + enqueue_task(rq, p, 0); + clear_ed_task(p, rq); + task_rq_unlock(rq, p, &flags); + free_task_load_ptrs(p); +} +#endif /* CONFIG_SCHED_HMP */ + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
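
wake_up_process_no_notif() is intended for housekeeping kthreads whose wakeups should not be reported to the load/frequency notification path (it passes WF_NO_NOTIFIER into try_to_wake_up()); the core_ctl thread added later in this patch is woken exactly this way. A hedged sketch of the call pattern, with a made-up worker thread; the header that declares the new helper is an assumption here:

    #include <linux/err.h>
    #include <linux/init.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>   /* assumed to carry the new prototype */

    static struct task_struct *worker;

    static int worker_fn(void *unused)
    {
        /* Canonical sleep/wake loop: set state before checking stop. */
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
            schedule();
            __set_current_state(TASK_RUNNING);
            /* ... drain whatever work was queued for us ... */
            set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
    }

    /* Callers that must not perturb the governor kick the thread this way. */
    static void kick_worker(void)
    {
        wake_up_process_no_notif(worker);
    }

    static int __init worker_example_init(void)
    {
        worker = kthread_run(worker_fn, NULL, "example_worker");
        return PTR_ERR_OR_ZERO(worker);
    }
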
@@ -2191,7 +2362,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif INIT_LIST_HEAD(&p->se.group_node); - walt_init_new_task_load(p); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2207,6 +2377,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_rt_schedtune_timer(&p->rt); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2276,7 +2450,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); + int cpu; + + init_new_task_load(p); + cpu = get_cpu(); __sched_fork(clone_flags, p); /* @@ -2468,11 +2645,10 @@ void wake_up_new_task(struct task_struct *p) unsigned long flags; struct rq *rq; + add_new_task_to_grp(p); raw_spin_lock_irqsave(&p->pi_lock, flags); p->state = TASK_RUNNING; - walt_init_new_task_load(p); - /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2487,10 +2663,9 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p); + mark_task_starting(p); update_rq_clock(rq); post_init_entity_util_avg(&p->se); - - walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2618,6 +2793,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + +#ifdef CONFIG_MSM_APP_SETTINGS + if (use_app_setting) + switch_app_setting_bit(prev, next); + + if (use_32bit_app_setting || use_32bit_app_setting_pro) + switch_32bit_app_setting_bit(prev, next); +#endif } /** @@ -2909,7 +3092,7 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) *load = rq->load.weight; } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) /* * sched_exec - execve() is a valuable balancing opportunity, because at @@ -2919,18 +3102,23 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - int dest_cpu; + int dest_cpu, curr_cpu; + +#ifdef CONFIG_SCHED_HMP + return; +#endif raw_spin_lock_irqsave(&p->pi_lock, flags); + curr_cpu = task_cpu(p); dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; - if (likely(cpu_active(dest_cpu))) { + if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) { struct migration_arg arg = { p, dest_cpu }; raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); return; } unlock: @@ -2997,19 +3185,31 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + u64 wallclock; + bool early_notif; + u32 old_load; + struct related_thread_group *grp; sched_clock_tick(); raw_spin_lock(&rq->lock); - walt_set_window_start(rq); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, - walt_ktime_clock(), 0); + old_load = task_load(curr); + set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, 
TASK_UPDATE, wallclock, 0); + + cpufreq_update_util(rq, 0); + early_notif = early_detection_notify(rq, wallclock); raw_spin_unlock(&rq->lock); + if (early_notif) + atomic_notifier_call_chain(&load_alert_notifier_head, + 0, (void *)(long)cpu); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -3018,8 +3218,17 @@ void scheduler_tick(void) #endif rq_last_tick_reset(rq); + rcu_read_lock(); + grp = task_related_thread_group(curr); + if (update_preferred_cluster(grp, curr, old_load)) + set_preferred_cluster(grp); + rcu_read_unlock(); + if (curr->sched_class == &fair_sched_class) check_for_migration(rq, curr); + + if (cpu == tick_do_timer_cpu) + core_ctl_check(wallclock); } #ifdef CONFIG_NO_HZ_FULL @@ -3062,9 +3271,24 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * preemptoff stack tracing threshold in ns. + * default: 1ms + */ +unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL; + +struct preempt_store { + u64 ts; + unsigned long caddr[4]; + bool irqs_disabled; +}; + +static DEFINE_PER_CPU(struct preempt_store, the_ps); void preempt_count_add(int val) { + struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id()); + #ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? @@ -3085,6 +3309,13 @@ void preempt_count_add(int val) #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif + ps->ts = sched_clock(); + ps->caddr[0] = CALLER_ADDR0; + ps->caddr[1] = CALLER_ADDR1; + ps->caddr[2] = CALLER_ADDR2; + ps->caddr[3] = CALLER_ADDR3; + ps->irqs_disabled = irqs_disabled(); + trace_preempt_off(CALLER_ADDR0, ip); } } @@ -3107,8 +3338,22 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) + if (preempt_count() == val) { + struct preempt_store *ps = &per_cpu(the_ps, + raw_smp_processor_id()); + u64 delta = sched_clock() - ps->ts; + + /* + * Trace preempt disable stack if preemption + * is disabled for more than the threshold. 
+ */ + if (delta > sysctl_preemptoff_tracing_threshold_ns) + trace_sched_preempt_disable(delta, ps->irqs_disabled, + ps->caddr[0], ps->caddr[1], + ps->caddr[2], ps->caddr[3]); + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -3121,6 +3366,9 @@ NOKPROBE_SYMBOL(preempt_count_sub); */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3131,12 +3379,14 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); #endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -3251,7 +3501,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3270,13 +3519,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3309,14 +3561,20 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); - wallclock = walt_ktime_clock(); - walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); - walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; + BUG_ON(task_cpu(next) != cpu_of(rq)); + + wallclock = sched_ktime_clock(); if (likely(prev != next)) { + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); + cpufreq_update_util(rq, 0); + if (!is_idle_task(prev) && !prev->on_rq) + update_avg_burst(prev); + #ifdef CONFIG_SCHED_WALT if (!prev->on_rq) prev->last_sleep_ts = wallclock; @@ -3325,10 +3583,14 @@ static void __sched notrace __schedule(bool preempt) rq->curr = next; ++*switch_count; + set_task_last_switch_out(prev, wallclock); + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { + update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); + cpufreq_update_util(rq, 0); lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); } @@ -3513,7 +3775,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3542,11 +3804,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + 
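
The per-CPU preempt_store bookkeeping above records a timestamp and the caller chain when preemption is first disabled, and emits a trace only if the section lasted longer than sysctl_preemptoff_tracing_threshold_ns (1 ms by default). A minimal userspace analogue of the timing side of that logic:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    /* Default matches the kernel knob: 1 ms expressed in nanoseconds. */
    static uint64_t threshold_ns = 1000000ULL;
    static uint64_t section_start_ns;

    static uint64_t now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    static void section_enter(void)   /* models preempt_count going 0 -> 1 */
    {
        section_start_ns = now_ns();
    }

    static void section_exit(void)    /* models preempt_count going 1 -> 0 */
    {
        uint64_t delta = now_ns() - section_start_ns;

        if (delta > threshold_ns)
            fprintf(stderr, "section held for %llu ns (> %llu)\n",
                    (unsigned long long)delta,
                    (unsigned long long)threshold_ns);
    }

    int main(void)
    {
        struct timespec nap = { 0, 2000000 };   /* 2 ms, above the threshold */

        section_enter();
        nanosleep(&nap, NULL);
        section_exit();
        return 0;
    }
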
prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3565,7 +3831,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3573,7 +3839,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3588,7 +3854,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3947,6 +4213,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); @@ -4130,17 +4397,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4150,15 +4414,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -4236,7 +4499,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); +EXPORT_SYMBOL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -4556,6 +4819,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; + int dest_cpu; + cpumask_t allowed_mask; rcu_read_lock(); @@ -4617,20 +4882,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. 
Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; + cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask); + dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask); + if (dest_cpu < nr_cpu_ids) { + retval = __set_cpus_allowed_ptr(p, new_mask, true); + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } } + } else { + retval = -EINVAL; } + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -4694,6 +4965,15 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + + /* + * The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -5115,6 +5395,8 @@ void show_state_filter(unsigned long state_filter) sched_show_task(p); } + touch_all_softlockup_watchdogs(); + #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif @@ -5152,6 +5434,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, @@ -5374,18 +5658,54 @@ static struct task_struct fake_task = { }; /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Remove a task from the runqueue and pretend that it's migrating. This + * should prevent migrations for the detached task and disallow further + * changes to tsk_cpus_allowed. + */ +static void +detach_one_task(struct task_struct *p, struct rq *rq, struct list_head *tasks) +{ + lockdep_assert_held(&rq->lock); + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(rq, p, 0); + list_add(&p->se.group_node, tasks); +} + +static void attach_tasks(struct list_head *tasks, struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_held(&rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + BUG_ON(task_rq(p) != rq); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + } +} + +/* + * Migrate all tasks (not pinned if pinned argument say so) from the rq, + * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq(). * * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. 
*/ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; + unsigned int num_pinned_kthreads = 1; /* this thread */ + LIST_HEAD(tasks); + cpumask_t avail_cpus; + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); /* * Fudge the rq selection such that the below task selection loop @@ -5421,6 +5741,14 @@ static void migrate_tasks(struct rq *dead_rq) BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + if (!migrate_pinned_tasks && next->flags & PF_KTHREAD && + !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) { + detach_one_task(next, rq, &tasks); + num_pinned_kthreads += 1; + lockdep_unpin_lock(&rq->lock); + continue; + } + /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either @@ -5439,26 +5767,271 @@ static void migrate_tasks(struct rq *dead_rq) * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. + * However, during cpu isolation the load balancer might have + * interferred since we don't stop all CPUs. Ignore warning for + * this case. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + WARN_ON(migrate_pinned_tasks); raw_spin_unlock(&next->pi_lock); continue; } /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); + dest_cpu = select_fallback_rq(dead_rq->cpu, next, false); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { + raw_spin_unlock(&next->pi_lock); raw_spin_unlock(&rq->lock); + notify_migration(dead_rq->cpu, dest_cpu, true, next); rq = dead_rq; + raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop; + + if (num_pinned_kthreads > 1) + attach_tasks(&tasks, rq); +} + +static void set_rq_online(struct rq *rq); +static void set_rq_offline(struct rq *rq); + +int do_isolation_work_cpu_stop(void *data) +{ + unsigned int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + watchdog_disable(cpu); + + irq_migrate_all_off_this_cpu(); + + local_irq_disable(); + + sched_ttwu_pending(); + + raw_spin_lock(&rq->lock); + + /* + * Temporarily mark the rq as offline. This will allow us to + * move tasks off the CPU. + */ + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + + migrate_tasks(rq, false); + + if (rq->rd) + set_rq_online(rq); + raw_spin_unlock(&rq->lock); + + /* + * We might have been in tickless state. Clear NOHZ flags to avoid + * us being kicked for helping out with balancing + */ + nohz_balance_clear_nohz_mask(cpu); + + clear_hmp_request(cpu); + local_irq_enable(); + return 0; +} + +int do_unisolation_work_cpu_stop(void *data) +{ + watchdog_enable(smp_processor_id()); + return 0; +} + +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd); + +static void sched_update_group_capacities(int cpu) +{ + struct sched_domain *sd; + + mutex_lock(&sched_domains_mutex); + rcu_read_lock(); + + for_each_domain(cpu, sd) { + int balance_cpu = group_balance_cpu(sd->groups); + + init_sched_groups_capacity(cpu, sd); + /* + * Need to ensure this is also called with balancing + * cpu. 
+ */ + if (cpu != balance_cpu) + init_sched_groups_capacity(balance_cpu, sd); + } + + rcu_read_unlock(); + mutex_unlock(&sched_domains_mutex); +} + +static unsigned int cpu_isolation_vote[NR_CPUS]; + +int sched_isolate_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + + if (include_offline) { + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, cpu_isolated_mask); + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_and(&count_mask, mask, cpu_isolated_mask); + } + + return cpumask_weight(&count_mask); } + +/* + * 1) CPU is isolated and cpu is offlined: + * Unisolate the core. + * 2) CPU is not isolated and CPU is offlined: + * No action taken. + * 3) CPU is offline and request to isolate + * Request ignored. + * 4) CPU is offline and isolated: + * Not a possible state. + * 5) CPU is online and request to isolate + * Normal case: Isolate the CPU + * 6) CPU is not isolated and comes back online + * Nothing to do + * + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). + */ +int sched_isolate_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + cpumask_t avail_cpus; + int ret_code = 0; + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + cpu_maps_update_begin(); + + cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); + + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + ret_code = -EINVAL; + goto out; + } + + if (!cpu_online(cpu)) { + ret_code = -EINVAL; + goto out; + } + + if (++cpu_isolation_vote[cpu] > 1) + goto out; + + /* + * There is a race between watchdog being enabled by hotplug and + * core isolation disabling the watchdog. When a CPU is hotplugged in + * and the hotplug lock has been released the watchdog thread might + * not have run yet to enable the watchdog. + * We have to wait for the watchdog to be enabled before proceeding. + */ + if (!watchdog_configured(cpu)) { + msleep(20); + if (!watchdog_configured(cpu)) { + --cpu_isolation_vote[cpu]; + ret_code = -EBUSY; + goto out; + } + } + + set_cpu_isolated(cpu, true); + cpumask_clear_cpu(cpu, &avail_cpus); + + /* Migrate timers */ + smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); + smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + + stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + + calc_load_migrate(rq); + update_max_interval(); + sched_update_group_capacities(cpu); + +out: + cpu_maps_update_done(); + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 1); + return ret_code; +} + +/* + * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY + * calling sched_unisolate_cpu() on a CPU that the client previously isolated. + * Client is also responsible for unisolating when a core goes offline + * (after CPU is marked offline). 
+ */ +int sched_unisolate_cpu_unlocked(int cpu) +{ + int ret_code = 0; + struct rq *rq = cpu_rq(cpu); + u64 start_time = 0; + + if (trace_sched_isolate_enabled()) + start_time = sched_clock(); + + if (!cpu_isolation_vote[cpu]) { + ret_code = -EINVAL; + goto out; + } + + if (--cpu_isolation_vote[cpu]) + goto out; + + if (cpu_online(cpu)) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + rq->age_stamp = sched_clock_cpu(cpu); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + set_cpu_isolated(cpu, false); + update_max_interval(); + sched_update_group_capacities(cpu); + + if (cpu_online(cpu)) { + stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0); + + /* Kick CPU to immediately do load balancing */ + if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + smp_send_reschedule(cpu); + } + +out: + trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], + start_time, 0); + return ret_code; +} + +int sched_unisolate_cpu(int cpu) +{ + int ret_code; + + cpu_maps_update_begin(); + ret_code = sched_unisolate_cpu_unlocked(cpu); + cpu_maps_update_done(); + return ret_code; +} + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5746,7 +6319,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: raw_spin_lock_irqsave(&rq->lock, flags); - walt_set_window_start(rq); + set_window_start(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; break; @@ -5767,17 +6340,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); - walt_migrate_sync_cpu(cpu); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, true); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; case CPU_DEAD: + clear_hmp_request(cpu); calc_load_migrate(rq); break; #endif @@ -6322,6 +6896,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + unsigned long next_balance = rq->next_balance; /* Remove the sched domains which do not contribute to scheduling. 
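
sched_isolate_cpu()/sched_unisolate_cpu() are reference counted per CPU through cpu_isolation_vote[]: only the first isolate request and the last matching unisolate request actually change the CPU's state, and an unisolate without a prior isolate fails. A small userspace model of just the voting; the real functions additionally refuse to isolate the last available CPU, quiesce timers, migrate tasks, and kick load balancing:

    #include <stdio.h>

    #define NR_CPUS 8

    static unsigned int isolation_vote[NR_CPUS];

    /* Only the first vote flips the CPU's state; later votes just count. */
    static int model_isolate(int cpu)
    {
        if (++isolation_vote[cpu] > 1)
            return 0;
        printf("cpu%d: isolating\n", cpu);
        return 0;
    }

    static int model_unisolate(int cpu)
    {
        if (!isolation_vote[cpu])
            return -1;              /* kernel returns -EINVAL */
        if (--isolation_vote[cpu])
            return 0;               /* other voters still hold it */
        printf("cpu%d: unisolating\n", cpu);
        return 0;
    }

    int main(void)
    {
        model_isolate(3);           /* isolates   */
        model_isolate(3);           /* refcounted */
        model_unisolate(3);         /* still held */
        model_unisolate(3);         /* unisolates */
        return 0;
    }
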
*/ for (tmp = sd; tmp; ) { @@ -6353,6 +6928,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; ) { + unsigned long interval; + + interval = msecs_to_jiffies(tmp->balance_interval); + if (time_after(next_balance, tmp->last_balance + interval)) + next_balance = tmp->last_balance + interval; + + tmp = tmp->parent; + } + rq->next_balance = next_balance; + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -6602,11 +7188,14 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + cpumask_t avail_mask; WARN_ON(!sg); do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + cpumask_andnot(&avail_mask, sched_group_cpus(sg), + cpu_isolated_mask); + sg->group_weight = cpumask_weight(&avail_mask); sg = sg->next; } while (sg != sd->groups); @@ -7339,6 +7928,9 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); #endif +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); +#endif /* Fixup, ensure @sd has at least @child cpus. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -7403,7 +7995,8 @@ static int build_sched_domains(const struct cpumask *cpu_map, continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { - init_sched_energy(i, sd, tl->energy); + if (energy_aware()) + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } @@ -7725,6 +8318,8 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + update_cluster_topology(); + init_hrtick(); /* Move init over to a non-isolated CPU */ @@ -7743,6 +8338,7 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -7766,6 +8362,15 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; +#ifdef CONFIG_SCHED_HMP + pr_info("HMP scheduling enabled.\n"); +#endif + + BUG_ON(num_possible_cpus() > BITS_PER_LONG); + + sched_boost_parse_dt(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7882,12 +8487,53 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->max_idle_balance_cost = sysctl_sched_migration_cost; -#ifdef CONFIG_SCHED_WALT +#ifdef CONFIG_SCHED_HMP + cpumask_set_cpu(i, &rq->freq_domain_cpumask); + rq->hmp_stats.cumulative_runnable_avg = 0; + rq->window_start = 0; + rq->hmp_stats.nr_big_tasks = 0; + rq->hmp_flags = 0; rq->cur_irqload = 0; rq->avg_irqload = 0; rq->irqload_ts = 0; + rq->static_cpu_pwr_cost = 0; + rq->cc.cycles = 1; + rq->cc.time = 1; + rq->cstate = 0; + rq->wakeup_latency = 0; + rq->wakeup_energy = 0; + + /* + * All cpus part of same cluster by default. 
This avoids the + * need to check for rq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + rq->cluster = &init_cluster; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); + rq->old_busy_time = 0; + rq->old_estimated_time = 0; + rq->old_busy_time_group = 0; + rq->hmp_stats.pred_demands_sum = 0; + rq->curr_table = 0; + rq->prev_top = 0; + rq->curr_top = 0; + + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { + memset(&rq->load_subs[j], 0, + sizeof(struct load_subtractions)); + + rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES, + sizeof(u8), GFP_NOWAIT); + + /* No other choice */ + BUG_ON(!rq->top_tasks[j]); + + clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]); + } #endif + rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7903,6 +8549,11 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } + i = alloc_related_thread_groups(); + BUG_ON(i); + + set_hmp_defaults(); + set_load_weight(&init_task); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -7927,6 +8578,7 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -7980,6 +8632,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7992,6 +8645,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -8006,12 +8662,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } +#ifdef CONFIG_PANIC_ON_SCHED_BUG + BUG(); #endif dump_stack(); } @@ -8223,7 +8881,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -8232,7 +8890,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -8614,7 +9272,7 @@ int sched_rr_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +inline struct task_group *css_tg(struct cgroup_subsys_state *css) { return css ? 
container_of(css, struct task_group, css) : NULL; } @@ -9014,6 +9672,13 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, #endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { +#ifdef CONFIG_SCHED_HMP + { + .name = "upmigrate_discourage", + .read_u64 = cpu_upmigrate_discourage_read_u64, + .write_u64 = cpu_upmigrate_discourage_write_u64, + }, +#endif #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100644 index 000000000000..2f060a570061 --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1171 @@ +/* Copyright (c) 2014-2017, 2020 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> + +#include <trace/events/sched.h> +#include "sched.h" + +#define MAX_CPUS_PER_CLUSTER 4 +#define MAX_CLUSTERS 2 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool is_big_cluster; + bool enable; + int nrrun; + bool nrrun_changed; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; + unsigned int max_nr; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t 
count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += snprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += snprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_is_big_cluster(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->is_big_cluster = val ? 
1 : 0; + return count; +} + +static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster); +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += snprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tCPU: %u\n", c->cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += snprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + + ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]); + if (ret != state->num_cpus) + return -EINVAL; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + } + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + int i; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } + 
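
The busy_up_thres/busy_down_thres stores above accept either a single value, applied to every CPU in the cluster, or exactly one value per CPU, while not_preferred insists on the per-CPU form. A userspace sketch of the more permissive rule, reusing the four-CPU cluster limit:

    #include <stdio.h>

    #define MAX_CPUS_PER_CLUSTER 4

    /* Accepts either one value for every CPU or one value per CPU. */
    static int parse_thres(const char *buf, unsigned int num_cpus,
                           unsigned int thres[MAX_CPUS_PER_CLUSTER])
    {
        unsigned int val[MAX_CPUS_PER_CLUSTER];
        int ret, i;

        ret = sscanf(buf, "%u %u %u %u", &val[0], &val[1], &val[2], &val[3]);
        if (ret != 1 && ret != (int)num_cpus)
            return -1;              /* kernel returns -EINVAL */

        for (i = 0; i < (int)num_cpus; i++)
            thres[i] = (ret == 1) ? val[0] : val[i];
        return 0;
    }

    int main(void)
    {
        unsigned int thres[MAX_CPUS_PER_CLUSTER];

        parse_thres("60", 4, thres);            /* 60 60 60 60    */
        parse_thres("40 50 60 70", 4, thres);   /* per-CPU values */
        return parse_thres("40 50", 4, thres) == -1 ? 0 : 1;
    }
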
spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(is_big_cluster); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &is_big_cluster.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +#define RQ_AVG_TOLERANCE 2 +#define RQ_AVG_DEFAULT_MS 20 +static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS; + +static s64 rq_avg_timestamp_ms; + +static void update_running_avg(bool trigger_update) +{ + int avg, iowait_avg, big_avg, old_nrrun; + int old_max_nr, max_nr, big_max_nr; + s64 now; + unsigned long flags; + struct cluster_data *cluster; + unsigned int index = 0; + + spin_lock_irqsave(&state_lock, flags); + + now = ktime_to_ms(ktime_get()); + if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + rq_avg_timestamp_ms = now; + sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg, + &max_nr, &big_max_nr); + + spin_unlock_irqrestore(&state_lock, flags); + + for_each_cluster(cluster, index) { + if (!cluster->inited) + continue; + + old_nrrun = cluster->nrrun; + old_max_nr = cluster->max_nr; + cluster->nrrun = cluster->is_big_cluster ? big_avg : avg; + cluster->max_nr = cluster->is_big_cluster ? 
big_max_nr : max_nr; + + if (cluster->nrrun != old_nrrun || + cluster->max_nr != old_max_nr) { + + if (trigger_update) + apply_need(cluster); + else + cluster->nrrun_changed = true; + } + } + return; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? 
cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + if (new_need == last_need) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +static int core_ctl_set_busy(unsigned int cpu, unsigned int busy) +{ + struct cpu_data *c = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = c->cluster; + unsigned int old_is_busy = c->is_busy; + + if (!cluster || !cluster->inited) + return 0; + + update_running_avg(false); + if (c->busy == busy && !cluster->nrrun_changed) + return 0; + c->busy = busy; + cluster->nrrun_changed = false; + + apply_need(cluster); + trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy); + return 0; +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process_no_notif(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; +static u64 core_ctl_check_interval; + +static bool do_check(u64 wallclock) +{ + bool do_check = false; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + if ((wallclock - core_ctl_check_timestamp) >= core_ctl_check_interval) { + core_ctl_check_timestamp = wallclock; + do_check = true; + } + spin_unlock_irqrestore(&state_lock, flags); + return do_check; +} + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (cluster->is_big_cluster) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + pr_err("Error turning off boost. 
Boost already turned off\n"); + ret = -EINVAL; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + break; + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) + apply_need(cluster); + + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_check(u64 wallclock) +{ + if (unlikely(!initialized)) + return; + + if (do_check(wallclock)) { + unsigned int index = 0; + struct cluster_data *cluster; + + update_running_avg(true); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't offline busy CPUs. */ + if (c->is_busy) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). 
+ */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int __ref cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + uint32_t cpu = (uintptr_t)hcpu; + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return NOTIFY_DONE; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + break; + + case CPU_DEAD: + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. 
*/ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + break; + default: + return NOTIFY_DONE; + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return NOTIFY_OK; +} + +static struct notifier_block __refdata cpu_notifier = { + .notifier_call = cpu_callback, +}; + +/* ============================ init code ============================== */ + +static cpumask_var_t core_ctl_disable_cpumask; +static bool core_ctl_disable_cpumask_present; + +static int __init core_ctl_disable_setup(char *str) +{ + if (!*str) + return -EINVAL; + + alloc_bootmem_cpumask_var(&core_ctl_disable_cpumask); + + if (cpulist_parse(str, core_ctl_disable_cpumask) < 0) { + free_bootmem_cpumask_var(core_ctl_disable_cpumask); + return -EINVAL; + } + + core_ctl_disable_cpumask_present = true; + pr_info("disable_cpumask=%*pbl\n", + cpumask_pr_args(core_ctl_disable_cpumask)); + + return 0; +} +early_param("core_ctl_disable_cpumask", core_ctl_disable_setup); + +static bool should_skip(const struct cpumask *mask) +{ + if (!core_ctl_disable_cpumask_present) + return false; + + /* + * We operate on a cluster basis. Disable the core_ctl for + * a cluster, if all of it's cpus are specified in + * core_ctl_disable_cpumask + */ + return cpumask_subset(mask, core_ctl_disable_cpumask); +} + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (should_skip(mask)) + return 0; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. 
Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + int ret; + + switch (val) { + case CPUFREQ_CREATE_POLICY: + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_pol_nb = { + .notifier_call = cpufreq_policy_cb, +}; + +static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_govinfo *info = data; + + switch (val) { + case CPUFREQ_LOAD_CHANGE: + core_ctl_set_busy(info->cpu, info->load); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_gov_nb = { + .notifier_call = cpufreq_gov_cb, +}; + +static int __init core_ctl_init(void) +{ + unsigned int cpu; + + if (should_skip(cpu_possible_mask)) + return 0; + + core_ctl_check_interval = (rq_avg_period_ms - RQ_AVG_TOLERANCE) + * NSEC_PER_MSEC; + + register_cpu_notifier(&cpu_notifier); + cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER); + cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER); + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + struct cpufreq_policy *policy; + int ret; + + policy = cpufreq_cpu_get(cpu); + if (policy) { + ret = cluster_init(policy->related_cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n" + , ret); + cpufreq_cpu_put(policy); + } + } + cpu_maps_update_done(); + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..14225d5d8617 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -27,6 +27,8 @@ * of the License. 
*/ +#include "sched.h" + #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> @@ -51,6 +53,27 @@ static int convert_prio(int prio) } /** + * drop_nopreempt_cpus - remove a cpu from the mask if it is likely + * non-preemptible + * @lowest_mask: mask with selected CPUs (non-NULL) + */ +static void +drop_nopreempt_cpus(struct cpumask *lowest_mask) +{ + unsigned int cpu = cpumask_first(lowest_mask); + + while (cpu < nr_cpu_ids) { + /* unlocked access */ + struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr); + + if (task_may_not_preempt(task, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + + cpu = cpumask_next(cpu, lowest_mask); + } +} + +/** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task @@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, { int idx = 0; int task_pri = convert_prio(p->prio); + bool drop_nopreempts = task_pri <= MAX_RT_PRIO; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); +retry: for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; int skip = 0; @@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); - + if (drop_nopreempts) + drop_nopreempt_cpus(lowest_mask); /* * We have to ensure that we have at least one bit * still set in the array, since the map could have @@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, return 1; } - + /* + * If we can't find any non-preemptible cpu's, retry so we can + * find the lowest priority target and avoid priority inversion. + */ + if (drop_nopreempts) { + drop_nopreempts = false; + goto retry; + } return 0; } @@ -246,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } + +/* + * cpupri_check_rt - check if CPU has a RT task + * should be called from rcu-sched read section. 
+ */ +bool cpupri_check_rt(void) +{ + int cpu = raw_smp_processor_id(); + + return cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL; +} diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index fc2cfd6b2941..e6ec68c15aa3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,7 +6,6 @@ #include <linux/context_tracking.h> #include <linux/cpufreq_times.h> #include "sched.h" -#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -51,10 +50,8 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; -#ifdef CONFIG_SCHED_WALT u64 wallclock; bool account = true; -#endif if (!sched_clock_irqtime) return; @@ -62,10 +59,8 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); -#ifdef CONFIG_SCHED_WALT wallclock = sched_clock_cpu(cpu); -#endif - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + delta = wallclock - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); irq_time_write_begin(); @@ -79,16 +74,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); -#ifdef CONFIG_SCHED_WALT else account = false; -#endif irq_time_write_end(); -#ifdef CONFIG_SCHED_WALT + if (account) - walt_account_irqtime(cpu, curr, delta, wallclock); -#endif + sched_account_irqtime(cpu, curr, delta, wallclock); + else if (curr != this_cpu_ksoftirqd()) + sched_account_irqstart(cpu, curr, wallclock); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5c6ffddcafcd..188c8388a63f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -291,9 +291,11 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p /* * By now the task is replenished and enqueued; migrate it. 
*/ + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); activate_task(later_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; if (!fallback) resched_curr(later_rq); @@ -992,6 +994,41 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_dl(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -1001,7 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); - walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); + inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1016,7 +1053,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); - walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); + dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -1712,6 +1749,7 @@ retry: goto retry; } + next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); clear_average_bw(&next_task->dl, &rq->dl); next_task->on_rq = TASK_ON_RQ_MIGRATING; @@ -1719,6 +1757,7 @@ retry: next_task->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_QUEUED; ret = 1; resched_curr(later_rq); @@ -1804,6 +1843,7 @@ static void pull_dl_task(struct rq *this_rq) resched = true; + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); clear_average_bw(&p->dl, &src_rq->dl); p->on_rq = TASK_ON_RQ_MIGRATING; @@ -1811,6 +1851,7 @@ static void pull_dl_task(struct rq *this_rq) p->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; dmin = p->dl.deadline; /* Is there any other task even earlier? 
*/ @@ -2012,6 +2053,11 @@ const struct sched_class dl_sched_class = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_dl, + .dec_hmp_sched_stats = dec_hmp_sched_stats_dl, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_dl, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7f7116622631..ed8e6bb4531b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -227,6 +227,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", cfs_rq->throttle_count); + SEQ_printf(m, " .%-30s: %d\n", "runtime_enabled", + cfs_rq->runtime_enabled); +#ifdef CONFIG_SCHED_HMP + SEQ_printf(m, " .%-30s: %d\n", "nr_big_tasks", + cfs_rq->hmp_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "cumulative_runnable_avg", + cfs_rq->hmp_stats.cumulative_runnable_avg); +#endif #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -306,6 +314,23 @@ do { \ P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); +#ifdef CONFIG_SMP + P(cpu_capacity); +#endif +#ifdef CONFIG_SCHED_HMP + P(static_cpu_pwr_cost); + P(cluster->static_cluster_pwr_cost); + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + P(hmp_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "hmp_stats.cumulative_runnable_avg", + rq->hmp_stats.cumulative_runnable_avg); +#endif #undef P #undef PN @@ -386,6 +411,15 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_HMP + P(sched_upmigrate); + P(sched_downmigrate); + P(sched_init_task_load_windows); + P(min_capacity); + P(max_capacity); + P(sched_ravg_window); + P(sched_load_granule); +#endif #undef PN #undef P @@ -408,6 +442,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } +#ifdef CONFIG_SYSRQ_SCHED_DEBUG void sysrq_sched_debug_show(void) { int cpu; @@ -417,6 +452,7 @@ void sysrq_sched_debug_show(void) print_cpu(NULL, cpu); } +#endif /* * This itererator needs some explanation. 
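The cpupri_find() change earlier in this diff first drops CPUs whose current task is likely non-preemptible and, if that empties the candidate mask, retries without the filter so the lowest-priority target is still found and priority inversion is avoided. Below is a minimal standalone sketch of that two-pass "filter, then retry unfiltered" pattern in plain C; it is not kernel code, and every name in it (fake_cpu, pick_lowest_prio, find_target_cpu) is made up for illustration.

/*
 * Standalone sketch (not kernel code) of the two-pass selection pattern
 * used by the cpupri_find() change above: prefer candidates that can be
 * preempted right away, but fall back to an unfiltered search so a
 * lowest-priority target is always found when one exists.
 * All names below are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_cpu {
	int prio;		/* priority of the task currently running there */
	bool non_preempt;	/* likely non-preemptible right now */
};

/* Return index of the lowest-priority candidate, or -1 if none qualifies. */
static int pick_lowest_prio(const struct fake_cpu *cpus, int nr,
			    bool drop_nopreempt)
{
	int i, best = -1;

	for (i = 0; i < nr; i++) {
		if (drop_nopreempt && cpus[i].non_preempt)
			continue;
		if (best < 0 || cpus[i].prio < cpus[best].prio)
			best = i;
	}
	return best;
}

static int find_target_cpu(const struct fake_cpu *cpus, int nr)
{
	/* First pass: only CPUs we could preempt immediately. */
	int cpu = pick_lowest_prio(cpus, nr, true);

	/* Nothing left? Retry unfiltered, mirroring the "goto retry" above. */
	if (cpu < 0)
		cpu = pick_lowest_prio(cpus, nr, false);
	return cpu;
}

int main(void)
{
	struct fake_cpu cpus[] = {
		{ .prio = 10, .non_preempt = true  },
		{ .prio = 50, .non_preempt = true  },
		{ .prio = 20, .non_preempt = false },
	};

	/* Prints 2: the only preemptible CPU wins the first pass. */
	printf("target cpu: %d\n", find_target_cpu(cpus, 3));
	return 0;
}

If all entries were marked non_preempt, the first pass would return -1 and the fallback pass would pick index 0, the lowest-priority CPU overall, which is the behaviour the retry in the patch is meant to preserve.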
@@ -547,6 +583,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; + unsigned int load_avg; + + load_avg = pct_task_load(p); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); @@ -624,6 +663,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_cas_attempts); P(se.statistics.nr_wakeups_cas_count); +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + __P(load_avg); +#ifdef CONFIG_SCHED_HMP + P(ravg.demand); +#endif +#endif + { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c index b0656b7a93e3..50d183b1e156 100644 --- a/kernel/sched/energy.c +++ b/kernel/sched/energy.c @@ -27,7 +27,10 @@ #include <linux/sched_energy.h> #include <linux/stddef.h> +#include "sched.h" + struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; +bool sched_energy_aware; static void free_resources(void) { @@ -56,6 +59,13 @@ void init_sched_energy_costs(void) int sd_level, i, nstates, cpu; const __be32 *val; + if (!energy_aware()) { + sched_energy_aware = false; + return; + } + + sched_energy_aware = true; + for_each_possible_cpu(cpu) { cn = of_get_cpu_node(cpu, NULL); if (!cn) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 80fe9e258d5e..51443a801af5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,9 +32,8 @@ #include <linux/task_work.h> #include <linux/module.h> -#include <trace/events/sched.h> - #include "sched.h" +#include <trace/events/sched.h> #include "tune.h" #include "walt.h" @@ -56,12 +55,6 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; -#ifdef CONFIG_SCHED_WALT -unsigned int sysctl_sched_use_walt_cpu_util = 1; -unsigned int sysctl_sched_use_walt_task_util = 1; -__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = - (10 * NSEC_PER_MSEC); -#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -254,6 +247,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#ifdef CONFIG_SMP +static int active_load_balance_cpu_stop(void *data); +#endif const struct sched_class fair_sched_class; @@ -891,12 +887,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
+ */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -910,23 +950,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2633,7 +2656,27 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ +u32 sched_get_wake_up_idle(struct task_struct *p) +{ + u32 enabled = p->flags & PF_WAKE_UP_IDLE; + + return !!enabled; +} +EXPORT_SYMBOL(sched_get_wake_up_idle); + +int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle) +{ + int enable = !!wake_up_idle; + + if (enable) + p->flags |= PF_WAKE_UP_IDLE; + else + p->flags &= ~PF_WAKE_UP_IDLE; + + return 0; +} +EXPORT_SYMBOL(sched_set_wake_up_idle); + static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, @@ -2713,6 +2756,1068 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#ifdef CONFIG_SCHED_HMP + +/* CPU selection flag */ +#define SBC_FLAG_PREV_CPU 0x1 +#define SBC_FLAG_BEST_CAP_CPU 0x2 +#define SBC_FLAG_CPU_COST 0x4 +#define SBC_FLAG_MIN_COST 0x8 +#define SBC_FLAG_IDLE_LEAST_LOADED 0x10 +#define SBC_FLAG_IDLE_CSTATE 0x20 +#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40 +#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80 +#define SBC_FLAG_CSTATE_LOAD 0x100 +#define SBC_FLAG_BEST_SIBLING 0x200 +#define SBC_FLAG_WAKER_CPU 0x400 +#define SBC_FLAG_PACK_TASK 0x800 + +/* Cluster selection flag */ +#define SBC_FLAG_COLOC_CLUSTER 0x10000 +#define SBC_FLAG_WAKER_CLUSTER 0x20000 +#define SBC_FLAG_BACKUP_CLUSTER 0x40000 +#define SBC_FLAG_BOOST_CLUSTER 0x80000 + +struct cpu_select_env { + struct task_struct *p; + struct related_thread_group *rtg; + u8 reason; + u8 need_idle:1; + u8 need_waker_cluster:1; + u8 sync:1; + enum sched_boost_policy boost_policy; + u8 pack_task:1; + int prev_cpu; + DECLARE_BITMAP(candidate_list, NR_CPUS); + DECLARE_BITMAP(backup_list, NR_CPUS); + u64 task_load; + u64 cpu_load; + u32 sbc_best_flag; + u32 sbc_best_cluster_flag; + struct cpumask search_cpus; +}; + +struct cluster_cpu_stats { + int best_idle_cpu, 
least_loaded_cpu; + int best_capacity_cpu, best_cpu, best_sibling_cpu; + int min_cost, best_sibling_cpu_cost; + int best_cpu_wakeup_latency; + u64 min_load, best_load, best_sibling_cpu_load; + s64 highest_spare_capacity; +}; + +/* + * Should task be woken to any available idle cpu? + * + * Waking tasks to idle cpu has mixed implications on both performance and + * power. In many cases, scheduler can't estimate correctly impact of using idle + * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel + * module to pass a strong hint to scheduler that the task in question should be + * woken to idle cpu, generally to improve performance. + */ +static inline int wake_to_idle(struct task_struct *p) +{ + return (current->flags & PF_WAKE_UP_IDLE) || + (p->flags & PF_WAKE_UP_IDLE); +} + +static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +{ + u64 total_load; + + total_load = env->task_load + env->cpu_load; + + if (total_load > sched_spill_load || + (rq->nr_running + 1) > sysctl_sched_spill_nr_run) + return 1; + + return 0; +} + +static int skip_cpu(int cpu, struct cpu_select_env *env) +{ + int tcpu = task_cpu(env->p); + int skip = 0; + + if (!env->reason) + return 0; + + if (is_reserved(cpu)) + return 1; + + switch (env->reason) { + case UP_MIGRATION: + skip = !idle_cpu(cpu); + break; + case IRQLOAD_MIGRATION: + /* Purposely fall through */ + default: + skip = (cpu == tcpu); + break; + } + + return skip; +} + +static inline int +acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + int tcpu; + + if (!env->reason) + return 1; + + tcpu = task_cpu(env->p); + switch (env->reason) { + case UP_MIGRATION: + return cluster->capacity > cpu_capacity(tcpu); + + case DOWN_MIGRATION: + return cluster->capacity < cpu_capacity(tcpu); + + default: + break; + } + + return 1; +} + +static int +skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + if (!test_bit(cluster->id, env->candidate_list)) + return 1; + + if (!acceptable_capacity(cluster, env)) { + __clear_bit(cluster->id, env->candidate_list); + return 1; + } + + return 0; +} + +static struct sched_cluster * +select_least_power_cluster(struct cpu_select_env *env) +{ + struct sched_cluster *cluster; + + if (env->rtg) { + int cpu = cluster_first_cpu(env->rtg->preferred_cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), cpu); + + if (task_load_will_fit(env->p, env->task_load, + cpu, env->boost_policy)) { + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_policy == SCHED_BOOST_NONE) + return env->rtg->preferred_cluster; + + for_each_sched_cluster(cluster) { + if (cluster != env->rtg->preferred_cluster) { + __set_bit(cluster->id, + env->backup_list); + __clear_bit(cluster->id, + env->candidate_list); + } + } + + return env->rtg->preferred_cluster; + } + + /* + * Since the task load does not fit on the preferred + * cluster anymore, pretend that the task does not + * have any preferred cluster. This allows the waking + * task to get the appropriate CPU it needs as per the + * non co-location placement policy without having to + * wait until the preferred cluster is updated. 
+ */ + env->rtg = NULL; + } + + for_each_sched_cluster(cluster) { + if (!skip_cluster(cluster, env)) { + int cpu = cluster_first_cpu(cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cpu); + if (task_load_will_fit(env->p, env->task_load, cpu, + env->boost_policy)) + return cluster; + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + } + } + + return NULL; +} + +static struct sched_cluster * +next_candidate(const unsigned long *list, int start, int end) +{ + int cluster_id; + + cluster_id = find_next_bit(list, end, start - 1 + 1); + if (cluster_id >= end) + return NULL; + + return sched_cluster[cluster_id]; +} + +static void +update_spare_capacity(struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu, int capacity, + u64 cpu_load) +{ + s64 spare_capacity = sched_ravg_window - cpu_load; + + if (spare_capacity > 0 && + (spare_capacity > stats->highest_spare_capacity || + (spare_capacity == stats->highest_spare_capacity && + ((!env->need_waker_cluster && + capacity > cpu_capacity(stats->best_capacity_cpu)) || + (env->need_waker_cluster && + cpu_rq(cpu)->nr_running < + cpu_rq(stats->best_capacity_cpu)->nr_running))))) { + /* + * If sync waker is the only runnable of CPU, cr_avg of the + * CPU is 0 so we have high chance to place the wakee on the + * waker's CPU which likely causes preemtion of the waker. + * This can lead migration of preempted waker. Place the + * wakee on the real idle CPU when it's possible by checking + * nr_running to avoid such preemption. + */ + stats->highest_spare_capacity = spare_capacity; + stats->best_capacity_cpu = cpu; + } +} + +static inline void find_backup_cluster( +struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + int i; + struct cpumask search_cpus; + + extern int num_clusters; + + while (!bitmap_empty(env->backup_list, num_clusters)) { + next = next_candidate(env->backup_list, 0, num_clusters); + __clear_bit(next->id, env->backup_list); + + cpumask_and(&search_cpus, &env->search_cpus, &next->cpus); + for_each_cpu(i, &search_cpus) { + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + update_spare_capacity(stats, env, i, next->capacity, + cpu_load_sync(i, env->sync)); + } + env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER; + } +} + +struct sched_cluster * +next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env, + struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + + extern int num_clusters; + + __clear_bit(cluster->id, env->candidate_list); + + if (env->rtg && preferred_cluster(cluster, env->p)) + return NULL; + + do { + if (bitmap_empty(env->candidate_list, num_clusters)) + return NULL; + + next = next_candidate(env->candidate_list, 0, num_clusters); + if (next) { + if (next->min_power_cost > stats->min_cost) { + clear_bit(next->id, env->candidate_list); + next = NULL; + continue; + } + + if (skip_cluster(next, env)) + next = NULL; + } + } while (!next); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cluster_first_cpu(next)); + return next; +} + +#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int wakeup_latency; + int prev_cpu = env->prev_cpu; + + wakeup_latency = cpu_rq(cpu)->wakeup_latency; + + if (env->need_idle) { + stats->min_cost = cpu_cost; + if 
(idle_cpu(cpu)) { + if (wakeup_latency < stats->best_cpu_wakeup_latency || + (wakeup_latency == stats->best_cpu_wakeup_latency && + cpu == prev_cpu)) { + stats->best_idle_cpu = cpu; + stats->best_cpu_wakeup_latency = wakeup_latency; + } + } else { + if (env->cpu_load < stats->min_load || + (env->cpu_load == stats->min_load && + cpu == prev_cpu)) { + stats->least_loaded_cpu = cpu; + stats->min_load = env->cpu_load; + } + } + + return; + } + + if (cpu_cost < stats->min_cost) { + stats->min_cost = cpu_cost; + stats->best_cpu_wakeup_latency = wakeup_latency; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CPU_COST; + return; + } + + /* CPU cost is the same. Start breaking the tie by C-state */ + + if (wakeup_latency > stats->best_cpu_wakeup_latency) + return; + + if (wakeup_latency < stats->best_cpu_wakeup_latency) { + stats->best_cpu_wakeup_latency = wakeup_latency; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER; + return; + } + + /* C-state is the same. Use prev CPU to break the tie */ + if (cpu == prev_cpu) { + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER; + return; + } + + if (stats->best_cpu != prev_cpu && + ((wakeup_latency == 0 && env->cpu_load < stats->best_load) || + (wakeup_latency > 0 && env->cpu_load > stats->best_load))) { + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD; + } +} +#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */ +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int prev_cpu = env->prev_cpu; + + if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) { + if (stats->best_sibling_cpu_cost > cpu_cost || + (stats->best_sibling_cpu_cost == cpu_cost && + stats->best_sibling_cpu_load > env->cpu_load)) { + stats->best_sibling_cpu_cost = cpu_cost; + stats->best_sibling_cpu_load = env->cpu_load; + stats->best_sibling_cpu = cpu; + } + } + + if ((cpu_cost < stats->min_cost) || + ((stats->best_cpu != prev_cpu && + stats->min_load > env->cpu_load) || cpu == prev_cpu)) { + if (env->need_idle) { + if (idle_cpu(cpu)) { + stats->min_cost = cpu_cost; + stats->best_idle_cpu = cpu; + } + } else { + stats->min_cost = cpu_cost; + stats->min_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_MIN_COST; + } + } +} +#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */ + +static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env) +{ + int cpu_cost; + + /* + * We try to find the least loaded *busy* CPU irrespective + * of the power cost. 
+ */ + if (env->pack_task) + cpu_cost = cpu_min_power_cost(cpu); + + else + cpu_cost = power_cost(cpu, task_load(env->p) + + cpu_cravg_sync(cpu, env->sync)); + + if (cpu_cost <= stats->min_cost) + __update_cluster_stats(cpu, stats, env, cpu_cost); +} + +static void find_best_cpu_in_cluster(struct sched_cluster *c, + struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int i; + struct cpumask search_cpus; + + cpumask_and(&search_cpus, &env->search_cpus, &c->cpus); + + env->need_idle = wake_to_idle(env->p) || c->wake_up_idle; + + for_each_cpu(i, &search_cpus) { + env->cpu_load = cpu_load_sync(i, env->sync); + + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + if (skip_cpu(i, env)) + continue; + + update_spare_capacity(stats, env, i, c->capacity, + env->cpu_load); + + /* + * need_idle takes precedence over sched boost but when both + * are set, idlest CPU with in all the clusters is selected + * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the + * big cluster is selected within boost_policy = BOOST_ON_BIG. + */ + if ((!env->need_idle && + env->boost_policy != SCHED_BOOST_NONE) || + env->need_waker_cluster || + sched_cpu_high_irqload(i) || + spill_threshold_crossed(env, cpu_rq(i))) + continue; + + update_cluster_stats(i, stats, env); + } +} + +static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats) +{ + stats->best_cpu = stats->best_idle_cpu = -1; + stats->best_capacity_cpu = stats->best_sibling_cpu = -1; + stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX; + stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX; + stats->highest_spare_capacity = 0; + stats->least_loaded_cpu = -1; + stats->best_cpu_wakeup_latency = INT_MAX; + /* No need to initialize stats->best_load */ +} + +static inline bool env_has_special_flags(struct cpu_select_env *env) +{ + if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE || + env->reason) + return true; + + return false; +} + +static inline bool +bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int prev_cpu; + struct task_struct *task = env->p; + struct sched_cluster *cluster; + + if (!task->ravg.mark_start || !sched_short_sleep_task_threshold) + return false; + + prev_cpu = env->prev_cpu; + if (!cpumask_test_cpu(prev_cpu, &env->search_cpus)) + return false; + + if (task->ravg.mark_start - task->last_cpu_selected_ts >= + sched_long_cpu_selection_threshold) + return false; + + /* + * This function should be used by task wake up path only as it's + * assuming p->last_switch_out_ts as last sleep time. + * p->last_switch_out_ts can denote last preemption time as well as + * last sleep time. 
+ */ + if (task->ravg.mark_start - task->last_switch_out_ts >= + sched_short_sleep_task_threshold) + return false; + + env->task_load = scale_load_to_cpu(task_load(task), prev_cpu); + cluster = cpu_rq(prev_cpu)->cluster; + + if (!task_load_will_fit(task, env->task_load, prev_cpu, + sched_boost_policy())) { + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + return false; + } + + env->cpu_load = cpu_load_sync(prev_cpu, env->sync); + if (sched_cpu_high_irqload(prev_cpu) || + spill_threshold_crossed(env, cpu_rq(prev_cpu))) { + update_spare_capacity(stats, env, prev_cpu, + cluster->capacity, env->cpu_load); + cpumask_clear_cpu(prev_cpu, &env->search_cpus); + return false; + } + + return true; +} + +static inline bool +wake_to_waker_cluster(struct cpu_select_env *env) +{ + return env->sync && + task_load(current) > sched_big_waker_task_load && + task_load(env->p) < sched_small_wakee_task_load; +} + +static inline bool +bias_to_waker_cpu(struct cpu_select_env *env, int cpu) +{ + return sysctl_sched_prefer_sync_wakee_to_waker && + cpu_rq(cpu)->nr_running == 1 && + cpumask_test_cpu(cpu, &env->search_cpus); +} + +static inline int +cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster) +{ + return cpumask_intersects(&env->search_cpus, &cluster->cpus); +} + +/* return cheapest cpu that can fit this task */ +static int select_best_cpu(struct task_struct *p, int target, int reason, + int sync) +{ + struct sched_cluster *cluster, *pref_cluster = NULL; + struct cluster_cpu_stats stats; + struct related_thread_group *grp; + unsigned int sbc_flag = 0; + int cpu = raw_smp_processor_id(); + bool special; + + struct cpu_select_env env = { + .p = p, + .reason = reason, + .need_idle = wake_to_idle(p), + .need_waker_cluster = 0, + .sync = sync, + .prev_cpu = target, + .rtg = NULL, + .sbc_best_flag = 0, + .sbc_best_cluster_flag = 0, + .pack_task = false, + }; + + env.boost_policy = task_sched_boost(p) ? + sched_boost_policy() : SCHED_BOOST_NONE; + + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); + bitmap_zero(env.backup_list, NR_CPUS); + + cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask); + cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask); + + init_cluster_cpu_stats(&stats); + special = env_has_special_flags(&env); + + rcu_read_lock(); + + grp = task_related_thread_group(p); + + if (grp && grp->preferred_cluster) { + pref_cluster = grp->preferred_cluster; + if (!cluster_allowed(&env, pref_cluster)) + clear_bit(pref_cluster->id, env.candidate_list); + else + env.rtg = grp; + } else if (!special) { + cluster = cpu_rq(cpu)->cluster; + if (wake_to_waker_cluster(&env)) { + if (bias_to_waker_cpu(&env, cpu)) { + target = cpu; + sbc_flag = SBC_FLAG_WAKER_CLUSTER | + SBC_FLAG_WAKER_CPU; + goto out; + } else if (cluster_allowed(&env, cluster)) { + env.need_waker_cluster = 1; + bitmap_zero(env.candidate_list, NR_CPUS); + __set_bit(cluster->id, env.candidate_list); + env.sbc_best_cluster_flag = + SBC_FLAG_WAKER_CLUSTER; + } + } else if (bias_to_prev_cpu(&env, &stats)) { + sbc_flag = SBC_FLAG_PREV_CPU; + goto out; + } + } + + if (!special && is_short_burst_task(p)) { + env.pack_task = true; + sbc_flag = SBC_FLAG_PACK_TASK; + } +retry: + cluster = select_least_power_cluster(&env); + + if (!cluster) + goto out; + + /* + * 'cluster' now points to the minimum power cluster which can satisfy + * task's perf goals. Walk down the cluster list starting with that + * cluster. 
For non-small tasks, skip clusters that don't have + * mostly_idle/idle cpus + */ + + do { + find_best_cpu_in_cluster(cluster, &env, &stats); + + } while ((cluster = next_best_cluster(cluster, &env, &stats))); + + if (env.need_idle) { + if (stats.best_idle_cpu >= 0) { + target = stats.best_idle_cpu; + sbc_flag |= SBC_FLAG_IDLE_CSTATE; + } else if (stats.least_loaded_cpu >= 0) { + target = stats.least_loaded_cpu; + sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED; + } + } else if (stats.best_cpu >= 0) { + if (stats.best_sibling_cpu >= 0 && + stats.best_cpu != task_cpu(p) && + stats.min_cost == stats.best_sibling_cpu_cost) { + stats.best_cpu = stats.best_sibling_cpu; + sbc_flag |= SBC_FLAG_BEST_SIBLING; + } + sbc_flag |= env.sbc_best_flag; + target = stats.best_cpu; + } else { + if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) { + env.rtg = NULL; + goto retry; + } + + /* + * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with + * backup_list = little cluster, candidate_list = none and + * stats->best_capacity_cpu points the best spare capacity + * CPU among the CPUs in the big cluster. + */ + if (env.boost_policy == SCHED_BOOST_ON_BIG && + stats.best_capacity_cpu >= 0) + sbc_flag |= SBC_FLAG_BOOST_CLUSTER; + else + find_backup_cluster(&env, &stats); + + if (stats.best_capacity_cpu >= 0) { + target = stats.best_capacity_cpu; + sbc_flag |= SBC_FLAG_BEST_CAP_CPU; + } + } + p->last_cpu_selected_ts = sched_ktime_clock(); +out: + sbc_flag |= env.sbc_best_cluster_flag; + rcu_read_unlock(); + trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p), + env.reason, env.sync, env.need_idle, sbc_flag, target); + return target; +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static inline struct task_group *next_task_group(struct task_group *tg) +{ + tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); + + return (&tg->list == &task_groups) ? 
NULL : tg; +} + +/* Iterate over all cfs_rq in a cpu */ +#define for_each_cfs_rq(cfs_rq, tg, cpu) \ + for (tg = container_of(&task_groups, struct task_group, list); \ + ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));) + +void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) +{ + struct task_group *tg; + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + + for_each_cfs_rq(cfs_rq, tg, cpu) + reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra); + + rcu_read_unlock(); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); + +/* Add task's contribution to a cpu' HMP statistics */ +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* + * Although below check is not strictly required (as + * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called + * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on + * efficiency by short-circuiting for_each_sched_entity() loop when + * sched_disable_window_stats + */ + if (sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + inc_rq_hmp_stats(rq, p, change_cra); +} + +/* Remove task's contribution from a cpu' HMP statistics */ +static void +_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* See comment on efficiency in _inc_hmp_sched_stats_fair */ + if (sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + dec_rq_hmp_stats(rq, p, change_cra); +} + +static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _inc_hmp_sched_stats_fair(rq, p, 1); +} + +static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _dec_hmp_sched_stats_fair(rq, p, 1); +} + +static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); + } +} + +static int task_will_be_throttled(struct task_struct *p); + +#else /* CONFIG_CFS_BANDWIDTH */ + +inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } + +static void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + inc_nr_big_task(&rq->hmp_stats, p); + inc_cumulative_runnable_avg(&rq->hmp_stats, p); 
+} + +static void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + dec_nr_big_task(&rq->hmp_stats, p); + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} +static void +fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); +} + +static inline int task_will_be_throttled(struct task_struct *p) +{ + return 0; +} + +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); +} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/* + * Reset balance_interval at all sched_domain levels of given cpu, so that it + * honors kick. + */ +static inline void reset_balance_interval(int cpu) +{ + struct sched_domain *sd; + + if (cpu >= nr_cpu_ids) + return; + + rcu_read_lock(); + for_each_domain(cpu, sd) + sd->balance_interval = 0; + rcu_read_unlock(); +} + +/* + * Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal + * cpu as per its demand or priority) + * + * Returns reason why task needs to be migrated + */ +static inline int migration_needed(struct task_struct *p, int cpu) +{ + int nice; + struct related_thread_group *grp; + + if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1) + return 0; + + /* No need to migrate task that is about to be throttled */ + if (task_will_be_throttled(p)) + return 0; + + if (sched_boost_policy() == SCHED_BOOST_ON_BIG && + cpu_capacity(cpu) != max_capacity && task_sched_boost(p)) + return UP_MIGRATION; + + if (sched_cpu_high_irqload(cpu)) + return IRQLOAD_MIGRATION; + + nice = task_nice(p); + rcu_read_lock(); + grp = task_related_thread_group(p); + /* + * Don't assume higher capacity means higher power. If the task + * is running on the power efficient CPU, avoid migrating it + * to a lower capacity cluster. + */ + if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) && + cpu_capacity(cpu) > min_capacity && + cpu_max_power_cost(cpu) == max_power_cost) { + rcu_read_unlock(); + return DOWN_MIGRATION; + } + + if (!task_will_fit(p, cpu)) { + rcu_read_unlock(); + return UP_MIGRATION; + } + rcu_read_unlock(); + + return 0; +} + +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +static DEFINE_RAW_SPINLOCK(migration_lock); + +static bool do_migration(int reason, int new_cpu, int cpu) +{ + if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) + && same_cluster(new_cpu, cpu)) + return false; + + /* Inter cluster high irqload migrations are OK */ + return new_cpu != cpu; +} + +/* + * Check if currently running task should be migrated to a better cpu. + * + * Todo: Effect this via changes to nohz_balancer_kick() and load balance? 
+ */ +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq), new_cpu; + int active_balance = 0, reason; + + reason = migration_needed(p, cpu); + if (!reason) + return; + + raw_spin_lock(&migration_lock); + new_cpu = select_best_cpu(p, cpu, reason, 0); + + if (do_migration(reason, new_cpu, cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + mark_reserved(new_cpu); + } + + raw_spin_unlock(&migration_lock); + + if (active_balance) + stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, + &rq->active_balance_work); +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->hmp_stats.nr_big_tasks = 0; + cfs_rq->hmp_stats.cumulative_runnable_avg = 0; + cfs_rq->hmp_stats.pred_demands_sum = 0; +} + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg += + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum; +} + +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg -= + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum; + + BUG_ON(stats->nr_big_tasks < 0 || + (s64)stats->cumulative_runnable_avg < 0); + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +#else /* CONFIG_CFS_BANDWIDTH */ + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#endif /* CONFIG_CFS_BANDWIDTH */ + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#define dec_throttled_cfs_rq_hmp_stats(...) +#define inc_throttled_cfs_rq_hmp_stats(...) 
+ +#endif /* CONFIG_SCHED_HMP */ + #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 #error "load tracking assumes 2^10 as unit" #endif @@ -2836,6 +3941,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } + if (running) sa->util_sum += scaled_delta * scale_cpu; @@ -3434,6 +4540,12 @@ static inline int idle_balance(struct rq *rq) return 0; } +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + #endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4060,6 +5172,35 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +#ifdef CONFIG_SCHED_HMP +/* + * Check if task is part of a hierarchy where some cfs_rq does not have any + * runtime left. + * + * We can't rely on throttled_hierarchy() to do this test, as + * cfs_rq->throttle_count will not be updated yet when this function is called + * from scheduler_tick() + */ +static int task_will_be_throttled(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq; + + if (!cfs_bandwidth_used()) + return 0; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + if (!cfs_rq->runtime_enabled) + continue; + if (cfs_rq->runtime_remaining <= 0) + return 1; + } + + return 0; +} +#endif + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -4139,13 +5280,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq); + } cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -4170,6 +5314,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); + + /* Log effect on hmp stats after throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4179,6 +5329,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se; int enqueue = 1; long task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4206,17 +5357,26 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq); if (cfs_rq_throttled(cfs_rq)) break; } - if (!se) + if (!se) { add_nr_running(rq, task_delta); + inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq); + } /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); + + /* Log effect on hmp stats after un-throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, @@ -4602,6 +5762,7 @@ static void init_cfs_rq_runtime(struct cfs_rq 
*cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + init_cfs_rq_hmp_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -4717,7 +5878,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4733,8 +5894,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) /* * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. + * current task is from our class. */ static void hrtick_update(struct rq *rq) { @@ -4743,8 +5903,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -4802,7 +5961,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; - walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; } @@ -4810,7 +5969,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; - walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4819,8 +5978,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + inc_rq_hmp_stats(rq, p, 1); + } #ifdef CONFIG_SMP @@ -4843,8 +6004,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_enqueue_task(p, cpu_of(rq)); - if (!se) { - walt_inc_cumulative_runnable_avg(rq, p); + if (energy_aware() && !se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -4882,7 +6042,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; - walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4902,7 +6062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; - walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4911,8 +6071,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + dec_rq_hmp_stats(rq, p, 1); + } #ifdef CONFIG_SMP @@ -4925,8 +6087,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_dequeue_task(p, cpu_of(rq)); - if (!se) - walt_dec_cumulative_runnable_avg(rq, p); #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5339,11 +6499,6 @@ unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } -static inline bool energy_aware(void) -{ - return sched_feat(ENERGY_AWARE); -} - struct energy_env { struct sched_group *sg_top; struct sched_group *sg_cap; @@ 
-5943,12 +7098,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static inline unsigned long task_util(struct task_struct *p) { -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_task_util) { - unsigned long demand = p->ravg.demand; - return (demand << 10) / walt_ravg_window; - } -#endif return p->se.avg.util_avg; } @@ -6333,6 +7482,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } + if (!(current->flags & PF_WAKE_UP_IDLE) && + !(p->flags & PF_WAKE_UP_IDLE)) + return target; + /* * Otherwise, iterate the domains and find an elegible idle cpu. */ @@ -6857,6 +8010,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; +#ifdef CONFIG_SCHED_HMP + return select_best_cpu(p, prev_cpu, 0, sync); +#endif + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p, sibling_count_hint) && @@ -7443,6 +8600,10 @@ enum group_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 +#define LBF_IGNORE_BIG_TASKS 0x100 +#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 +#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 struct lb_env { struct sched_domain *sd; @@ -7460,6 +8621,8 @@ struct lb_env { unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; + unsigned int busiest_grp_capacity; + unsigned int busiest_nr_running; unsigned int flags; @@ -7470,6 +8633,7 @@ struct lb_env { enum fbq_type fbq_type; enum group_type busiest_group_type; struct list_head tasks; + enum sched_boost_policy boost_policy; }; /* @@ -7567,6 +8731,7 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; + int twf, group_cpus; lockdep_assert_held(&env->src_rq->lock); @@ -7613,6 +8778,39 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) { + if (nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + if (env->boost_policy == SCHED_BOOST_ON_BIG && + !task_sched_boost(p)) + return 0; + } + + twf = task_will_fit(p, env->dst_cpu); + + /* + * Attempt to not pull tasks that don't fit. We may get lucky and find + * one that actually fits. + */ + if (env->flags & LBF_IGNORE_BIG_TASKS && !twf) + return 0; + + if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS && + !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p)) + return 0; + + /* + * Group imbalance can sometimes cause work to be pulled across groups + * even though the group could have managed the imbalance on its own. + * Prevent inter-cluster migrations for big tasks when the number of + * tasks is lower than the capacity of the group. + */ + group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity, + SCHED_CAPACITY_SCALE); + if (!twf && env->busiest_nr_running <= group_cpus) + return 0; + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; @@ -7620,15 +8818,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) IDLE or NEWLY_IDLE balance. 
+ * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. */ tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); - if (tsk_cache_hot <= 0 || + if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); @@ -7648,10 +8847,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + if (task_in_related_thread_group(p)) + env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; double_unlock_balance(env->src_rq, env->dst_rq); } @@ -7680,6 +8881,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); + return p; } return NULL; @@ -7699,12 +8901,20 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; unsigned long load; int detached = 0; + int orig_loop = env->loop; lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; + if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) + env->flags |= LBF_IGNORE_BIG_TASKS; + +redo: while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -7774,6 +8984,15 @@ next: list_move_tail(&p->se.group_node, tasks); } + if (env->flags & (LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) { + tasks = &env->src_rq->cfs_tasks; + env->flags &= ~(LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS); + env->loop = orig_loop; + goto redo; + } + /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather @@ -7792,8 +9011,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7940,6 +9159,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ +#ifdef CONFIG_SCHED_HMP + unsigned long sum_nr_big_tasks; + u64 group_cpu_load; /* Scaled load of all CPUs of the group */ +#endif unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; @@ -7983,10 +9206,64 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .avg_load = 0UL, .sum_nr_running = 0, .group_type = group_other, +#ifdef CONFIG_SCHED_HMP + .sum_nr_big_tasks = 0UL, + .group_cpu_load = 0ULL, +#endif }, }; } +#ifdef CONFIG_SCHED_HMP + +static int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + int local_cpu, busiest_cpu; + int local_capacity, busiest_capacity; + int local_pwr_cost, busiest_pwr_cost; + int nr_cpus; + int boost = sched_boost(); + + if (!sysctl_sched_restrict_cluster_spill || + boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST) + return 0; + + local_cpu = group_first_cpu(sds->local); + busiest_cpu = group_first_cpu(sds->busiest); + + local_capacity = 
cpu_max_possible_capacity(local_cpu); + busiest_capacity = cpu_max_possible_capacity(busiest_cpu); + + local_pwr_cost = cpu_max_power_cost(local_cpu); + busiest_pwr_cost = cpu_max_power_cost(busiest_cpu); + + if (local_pwr_cost <= busiest_pwr_cost) + return 0; + + if (local_capacity > busiest_capacity && + sds->busiest_stat.sum_nr_big_tasks) + return 0; + + nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest)); + if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) && + (sds->busiest_stat.sum_nr_running < + nr_cpus * sysctl_sched_spill_nr_run)) + return 1; + + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -8130,6 +9407,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -8161,9 +9440,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) do { struct sched_group_capacity *sgc = group->sgc; - capacity += sgc->capacity; - max_capacity = max(sgc->max_capacity, max_capacity); - min_capacity = min(sgc->min_capacity, min_capacity); + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); + } group = group->next; } while (group != child->groups); } @@ -8279,7 +9563,7 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline enum group_type group_classify(struct sched_group *group, - struct sg_lb_stats *sgs) + struct sg_lb_stats *sgs, struct lb_env *env) { if (sgs->group_no_capacity) return group_overloaded; @@ -8348,6 +9632,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); + trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, 0), + cpu_temp(i)); + + if (cpu_isolated(i)) + continue; + /* if we are entering idle and there are CPUs with * their tick stopped, do an update for them */ @@ -8368,6 +9660,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (nr_running > 1) *overload = true; +#ifdef CONFIG_SCHED_HMP + sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks; + sgs->group_cpu_load += cpu_load(i); +#endif + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -8379,25 +9676,62 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) { + if (energy_aware() && cpu_overutilized(i)) { *overutilized = true; if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i); } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + 
sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; + + sgs->group_weight = group->group_weight; + + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; +} - sgs->group_weight = group->group_weight; +#ifdef CONFIG_SCHED_HMP +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + if (env->idle != CPU_NOT_IDLE && + cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { + if (sgs->sum_nr_big_tasks > + sds->busiest_stat.sum_nr_big_tasks) { + env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; + return true; + } + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + return false; } +#else +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + return false; +} +#endif /** * update_sd_pick_busiest - return 1 on busiest group @@ -8419,35 +9753,40 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs)) + return true; + if (sgs->group_type > busiest->group_type) return true; if (sgs->group_type < busiest->group_type) return false; - /* - * Candidate sg doesn't face any serious load-balance problems - * so don't pick it if the local sg is already filled up. - */ - if (sgs->group_type == group_other && - !group_has_capacity(env, &sds->local_stat)) - return false; + if (energy_aware()) { + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; - if (sgs->avg_load <= busiest->avg_load) - return false; + if (sgs->avg_load <= busiest->avg_load) + return false; - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. - */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) - return false; + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + } asym_packing: /* This is the busiest node in its class. 
*/ @@ -8555,14 +9894,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); + sgs->group_type = group_classify(sg, sgs, env); } /* * Ignore task groups with misfit tasks if local group has no * capacity or if per-cpu capacity isn't higher. */ - if (sgs->group_type == group_misfit_task && + if (energy_aware() && + sgs->group_type == group_misfit_task && (!group_has_capacity(env, &sds->local_stat) || !group_smaller_cpu_capacity(sg, sds->local))) sgs->group_type = group_other; @@ -8570,6 +9910,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; + env->busiest_nr_running = sgs->sum_nr_running; + env->busiest_grp_capacity = sgs->group_capacity; } next_group: @@ -8591,12 +9933,12 @@ next_group: env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) { + if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) { env->dst_rq->rd->overutilized = overutilized; trace_sched_overutilized(overutilized); } } else { - if (!env->dst_rq->rd->overutilized && overutilized) { + if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) { env->dst_rq->rd->overutilized = true; trace_sched_overutilized(true); } @@ -8748,20 +10090,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { - /* Misfitting tasks should be migrated in any case */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = busiest->group_misfit_task; - return; - } + if (energy_aware()) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } - /* - * Busiest group is overloaded, local is not, use the spare - * cycles to maximize throughput - */ - if (busiest->group_type == group_overloaded && - local->group_type <= group_misfit_task) { - env->imbalance = busiest->load_per_task; - return; + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } } env->imbalance = 0; @@ -8798,7 +10142,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; /* Boost imbalance to allow misfit task to be balanced. 
*/ - if (busiest->group_type == group_misfit_task) + if (energy_aware() && busiest->group_type == group_misfit_task) env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task); @@ -8859,6 +10203,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) + goto force_balance; + + if (bail_inter_cluster_balance(env, &sds)) + goto out_balanced; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; @@ -8879,7 +10229,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* Misfitting tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) { + if (energy_aware() && busiest->group_type == group_misfit_task) { goto force_balance; } @@ -8930,6 +10280,60 @@ out_balanced: return NULL; } +#ifdef CONFIG_SCHED_HMP +static struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + struct rq *busiest = NULL, *busiest_big = NULL; + u64 max_runnable_avg = 0, max_runnable_avg_big = 0; + int max_nr_big = 0, nr_big; + bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE); + int i; + cpumask_t cpus; + + cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask); + + for_each_cpu(i, &cpus) { + struct rq *rq = cpu_rq(i); + u64 cumulative_runnable_avg = + rq->hmp_stats.cumulative_runnable_avg; + + if (!cpumask_test_cpu(i, env->cpus)) + continue; + + + if (find_big) { + nr_big = nr_big_tasks(rq); + if (nr_big > max_nr_big || + (nr_big > 0 && nr_big == max_nr_big && + cumulative_runnable_avg > max_runnable_avg_big)) { + max_runnable_avg_big = cumulative_runnable_avg; + busiest_big = rq; + max_nr_big = nr_big; + continue; + } + } + + if (cumulative_runnable_avg > max_runnable_avg) { + max_runnable_avg = cumulative_runnable_avg; + busiest = rq; + } + } + + if (busiest_big) + return busiest_big; + + env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE; + return busiest; +} +#else +static inline struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + return NULL; +} +#endif + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -8940,6 +10344,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; +#ifdef CONFIG_SCHED_HMP + return find_busiest_queue_hmp(env, group); +#endif + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -9008,15 +10416,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but * so long as it is large enough. */ -#define MAX_PINNED_INTERVAL 512 +#define MAX_PINNED_INTERVAL 16 /* Working cpumask for load_balance and load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +#define NEED_ACTIVE_BALANCE_THRESHOLD 10 + static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) + return 1; + if (env->idle == CPU_NEWLY_IDLE) { /* @@ -9041,7 +10454,8 @@ static int need_active_balance(struct lb_env *env) return 1; } - if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + if (energy_aware() && + (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && env->src_rq->cfs.h_nr_running == 1 && cpu_overutilized(env->src_cpu) && @@ -9049,10 +10463,18 @@ static int need_active_balance(struct lb_env *env) return 1; } - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); + return unlikely(sd->nr_balance_failed > + sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD); } -static int active_load_balance_cpu_stop(void *data); +static int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} static int should_we_balance(struct lb_env *env) { @@ -9071,7 +10493,8 @@ static int should_we_balance(struct lb_env *env) sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) || + cpu_isolated(cpu)) continue; balance_cpu = cpu; @@ -9079,7 +10502,7 @@ static int should_we_balance(struct lb_env *env) } if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); + balance_cpu = group_balance_cpu_not_isolated(sg); /* * First idle cpu or the first cpu(busiest) in this sched group @@ -9096,23 +10519,29 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { - int ld_moved, cur_ld_moved, active_balance = 0; + int ld_moved = 0, cur_ld_moved, active_balance = 0; struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; - struct sched_group *group; - struct rq *busiest; + struct sched_group *group = NULL; + struct rq *busiest = NULL; unsigned long flags; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .boost_policy = sched_boost_policy(), }; /* @@ -9160,12 +10589,24 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); update_rq_clock(busiest); + /* The world might have changed. 
Validate assumptions */ + if (busiest->nr_running <= 1) { + raw_spin_unlock_irqrestore(&busiest->lock, flags); + env.flags &= ~LBF_ALL_PINNED; + goto no_move; + } + + /* + * Set loop_max when rq's lock is taken to prevent a race. + */ + env.loop_max = min(sysctl_sched_nr_migrate, + busiest->nr_running); + /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations @@ -9253,17 +10694,22 @@ more_balance: } } +no_move: if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) + schedstat_inc(sd, lb_failed[idle]); + /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. */ - if (idle != CPU_NEWLY_IDLE) - if (env.src_grp_nr_running > 1) + if (idle != CPU_NEWLY_IDLE && + !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) { + if (env.src_grp_nr_running > 1) sd->nr_balance_failed++; + } if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); @@ -9285,7 +10731,8 @@ more_balance: * ->active_balance_work. Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -9296,17 +10743,31 @@ more_balance: stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); + *continue_balancing = 0; } /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; + sd->nr_balance_failed = + sd->cache_nice_tries + + NEED_ACTIVE_BALANCE_THRESHOLD - 1; } - } else + } else { sd->nr_balance_failed = 0; + /* Assumes one 'busiest' cpu that we pulled tasks from */ + if (!same_freq_domain(this_cpu, cpu_of(busiest))) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + + check_for_freq_change(this_rq, false, check_groups); + check_for_freq_change(busiest, false, check_groups); + } else { + check_for_freq_change(this_rq, true, false); + } + } if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; @@ -9364,6 +10825,11 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; out: + trace_sched_load_balance(this_cpu, idle, *continue_balancing, + group ? group->cpumask[0] : 0, + busiest ? busiest->nr_running : 0, + env.imbalance, env.flags, ld_moved, + sd->balance_interval); return ld_moved; } @@ -9406,6 +10872,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -9460,9 +10929,12 @@ static int idle_balance(struct rq *this_rq) /* * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. + * now runnable tasks on the balance rq or if + * continue_balancing has been unset (only possible + * due to active migration). 
*/ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + !continue_balancing) break; } rcu_read_unlock(); @@ -9514,13 +10986,19 @@ static int active_load_balance_cpu_stop(void *data) struct task_struct *push_task = NULL; int push_task_detached = 0; struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .flags = 0, + .loop = 0, + .boost_policy = sched_boost_policy(), }; + bool moved = false; raw_spin_lock_irq(&busiest_rq->lock); @@ -9541,12 +11019,15 @@ static int active_load_balance_cpu_stop(void *data) BUG_ON(busiest_rq == target_rq); push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; if (push_task) { if (task_on_rq_queued(push_task) && + push_task->state == TASK_RUNNING && task_cpu(push_task) == busiest_cpu && cpu_online(target_cpu)) { detach_task(push_task, &env); push_task_detached = 1; + moved = true; } goto out_unlock; } @@ -9565,14 +11046,18 @@ static int active_load_balance_cpu_stop(void *data) update_rq_clock(busiest_rq); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + moved = true; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; if (push_task) busiest_rq->push_task = NULL; @@ -9583,6 +11068,7 @@ out_unlock: if (push_task_detached) attach_one_task(target_rq, push_task); put_task_struct(push_task); + clear_reserved(target_cpu); } if (p) @@ -9590,6 +11076,15 @@ out_unlock: local_irq_enable(); + if (moved && !same_freq_domain(busiest_cpu, target_cpu)) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + check_for_freq_change(busiest_rq, false, check_groups); + check_for_freq_change(target_rq, false, check_groups); + } else if (moved) { + check_for_freq_change(target_rq, true, false); + } + return 0; } @@ -9605,9 +11100,49 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static inline int find_new_ilb(void) + +#ifdef CONFIG_SCHED_HMP +static inline int find_new_hmp_ilb(int type) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int call_cpu = raw_smp_processor_id(); + struct sched_domain *sd; + int ilb; + + rcu_read_lock(); + + /* Pick an idle cpu "closest" to call_cpu */ + for_each_domain(call_cpu, sd) { + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + sched_domain_span(sd)) { + if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT || + cpu_max_power_cost(ilb) <= + cpu_max_power_cost(call_cpu))) { + rcu_read_unlock(); + reset_balance_interval(ilb); + return ilb; + } + } + } + + rcu_read_unlock(); + return nr_cpu_ids; +} +#else /* CONFIG_SCHED_HMP */ +static inline int find_new_hmp_ilb(int type) +{ + return 0; +} +#endif /* CONFIG_SCHED_HMP */ + +static inline int find_new_ilb(int type) +{ + int ilb; + +#ifdef CONFIG_SCHED_HMP + return find_new_hmp_ilb(type); +#endif + + ilb = cpumask_first(nohz.idle_cpus_mask); if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; @@ -9620,13 +11155,13 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). 
*/ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(int type) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(); + ilb_cpu = find_new_ilb(type); if (ilb_cpu >= nr_cpu_ids) return; @@ -9643,16 +11178,21 @@ static void nohz_balancer_kick(void) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -9709,7 +11249,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9738,7 +11278,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9863,12 +11409,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t cpus; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -9911,6 +11460,79 @@ end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } +#ifdef CONFIG_SCHED_HMP +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + struct sched_domain *sd; + int i; + + if (rq->nr_running < 2) + return 0; + + if (!sysctl_sched_restrict_cluster_spill || + sched_boost_policy() == SCHED_BOOST_ON_ALL) + return 1; + + if (cpu_max_power_cost(cpu) == max_power_cost) + return 1; + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(rq->sd); + if (!sd) { + rcu_read_unlock(); + return 0; + } + + for_each_cpu(i, sched_domain_span(sd)) { + if (cpu_load(i) < sched_spill_load && + cpu_rq(i)->nr_running < + sysctl_sched_spill_nr_run) { + /* Change the kick type to limit to CPUs that + * are of equal or lower capacity. + */ + *type = NOHZ_KICK_RESTRICT; + break; + } + } + rcu_read_unlock(); + return 1; +} +#else +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + return 0; +} +#endif + +static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) +{ + unsigned long now = jiffies; + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. 
+ */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; + +#ifdef CONFIG_SCHED_HMP + return _nohz_kick_needed_hmp(rq, cpu, type); +#endif + + if (time_before(now, nohz.next_balance)) + return 0; + + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) + return true; + + /* Do idle load balance if there have misfit task */ + if (energy_aware()) + return rq->misfit_task; + + return (rq->nr_running >= 2); +} + /* * Current heuristic for kicking the idle load balancer in the presence * of an idle cpu in the system. @@ -9922,12 +11544,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline bool nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq, int *type) { - unsigned long now = jiffies; +#ifndef CONFIG_SCHED_HMP struct sched_domain *sd; struct sched_group_capacity *sgc; - int nr_busy, cpu = rq->cpu; + int nr_busy; +#endif + int cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -9940,24 +11564,10 @@ static inline bool nohz_kick_needed(struct rq *rq) set_cpu_sd_state_busy(); nohz_balance_exit_idle(cpu); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return false; - - if (time_before(now, nohz.next_balance)) - return false; - - if (rq->nr_running >= 2 && - (!energy_aware() || cpu_overutilized(cpu))) + if (_nohz_kick_needed(rq, cpu, type)) return true; - /* Do idle load balance if there have misfit task */ - if (energy_aware()) - return rq->misfit_task; - +#ifndef CONFIG_SCHED_HMP rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { @@ -9989,6 +11599,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unlock: rcu_read_unlock(); +#endif return kick; } #else @@ -10022,15 +11633,19 @@ static void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + int type = NOHZ_KICK_ANY; + + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. 
+ */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); + if (nohz_kick_needed(rq, &type)) + nohz_balancer_kick(type); #endif } @@ -10049,47 +11664,6 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } -static inline int -kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) -{ - int rc = 0; - - /* Invoke active balance to force migrate currently running task */ - raw_spin_lock(&rq->lock); - if (!rq->active_balance) { - rq->active_balance = 1; - rq->push_cpu = new_cpu; - get_task_struct(p); - rq->push_task = p; - rc = 1; - } - raw_spin_unlock(&rq->lock); - - return rc; -} - -void check_for_migration(struct rq *rq, struct task_struct *p) -{ - int new_cpu; - int active_balance; - int cpu = task_cpu(p); - - if (energy_aware() && rq->misfit_task) { - if (rq->curr->state != TASK_RUNNING || - rq->curr->nr_cpus_allowed == 1) - return; - - new_cpu = select_energy_cpu_brute(p, cpu, 0); - if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { - active_balance = kick_active_balance(rq, p, new_cpu); - if (active_balance) - stop_one_cpu_nowait(cpu, - active_load_balance_cpu_stop, - rq, &rq->active_balance_work); - } - } -} - #endif /* CONFIG_SMP */ /* @@ -10109,7 +11683,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + if (energy_aware() && + !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { rq->rd->overutilized = true; trace_sched_overutilized(true); } @@ -10609,6 +12184,11 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_fair, + .dec_hmp_sched_stats = dec_hmp_sched_stats_fair, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 55e461055332..c30c48fde7e6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,7 @@ SCHED_FEAT(NONTASK_CAPACITY, true) * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ -SCHED_FEAT(TTWU_QUEUE, true) +SCHED_FEAT(TTWU_QUEUE, false) #ifdef HAVE_RT_PUSH_IPI /* diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c new file mode 100644 index 000000000000..598656b42203 --- /dev/null +++ b/kernel/sched/hmp.c @@ -0,0 +1,4416 @@ +/* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Implementation credits: Srivatsa Vaddagiri, Steve Muckle + * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti + * and Vikram Mulukutla + */ + +#include <linux/cpufreq.h> +#include <linux/list_sort.h> +#include <linux/syscore_ops.h> + +#include "sched.h" + +#include <trace/events/sched.h> + +#define CSTATE_LATENCY_GRANULARITY_SHIFT (6) + +const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; + +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP"}; + +static ktime_t ktime_last; +static bool sched_ktime_suspended; + +static bool use_cycle_counter; +static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +inline void clear_ed_task(struct task_struct *p, struct rq *rq) +{ + if (p == rq->ed_task) + rq->ed_task = NULL; +} + +inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) +{ + p->last_switch_out_ts = wallclock; +} + +/* + * Note C-state for (idle) cpus. + * + * @cstate = cstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cpu + * @wakeup_latency = latency to wakeup from cstate + * + */ +void +sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) +{ + struct rq *rq = cpu_rq(cpu); + + rq->cstate = cstate; /* C1, C2 etc */ + rq->wakeup_energy = wakeup_energy; + /* disregard small latency delta (64 us). */ + rq->wakeup_latency = ((wakeup_latency >> + CSTATE_LATENCY_GRANULARITY_SHIFT) << + CSTATE_LATENCY_GRANULARITY_SHIFT); +} + +/* + * Note D-state for (idle) cluster. + * + * @dstate = dstate index, 0 -> active state + * @wakeup_energy = energy spent in waking up cluster + * @wakeup_latency = latency to wakeup from cluster + * + */ +void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, + int wakeup_energy, int wakeup_latency) +{ + struct sched_cluster *cluster = + cpu_rq(cpumask_first(cluster_cpus))->cluster; + cluster->dstate = dstate; + cluster->dstate_wakeup_energy = wakeup_energy; + cluster->dstate_wakeup_latency = wakeup_latency; +} + +u32 __weak get_freq_max_load(int cpu, u32 freq) +{ + /* 100% by default */ + return 100; +} + +struct freq_max_load_entry { + /* The maximum load which has accounted governor's headroom. 
*/ + u64 hdemand; +}; + +struct freq_max_load { + struct rcu_head rcu; + int length; + struct freq_max_load_entry freqs[0]; +}; + +static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); +static DEFINE_SPINLOCK(freq_max_load_lock); + +struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) +{ + return NULL; +} + +int sched_update_freq_max_load(const cpumask_t *cpumask) +{ + int i, cpu, ret; + unsigned int freq; + struct cpu_pstate_pwr *costs; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct freq_max_load *max_load, *old_max_load; + struct freq_max_load_entry *entry; + u64 max_demand_capacity, max_demand; + unsigned long flags; + u32 hfreq; + int hpct; + + if (!per_cpu_info) + return 0; + + spin_lock_irqsave(&freq_max_load_lock, flags); + max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); + for_each_cpu(cpu, cpumask) { + if (!per_cpu_info[cpu].ptable) { + ret = -EINVAL; + goto fail; + } + + old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + + /* + * allocate len + 1 and leave the last power cost as 0 for + * power_cost() can stop iterating index when + * per_cpu_info[cpu].len > len of max_load due to race between + * cpu power stats update and get_cpu_pwr_stats(). + */ + max_load = kzalloc(sizeof(struct freq_max_load) + + sizeof(struct freq_max_load_entry) * + (per_cpu_info[cpu].len + 1), GFP_ATOMIC); + if (unlikely(!max_load)) { + ret = -ENOMEM; + goto fail; + } + + max_load->length = per_cpu_info[cpu].len; + + max_demand = max_demand_capacity * + cpu_max_possible_capacity(cpu); + + i = 0; + costs = per_cpu_info[cpu].ptable; + while (costs[i].freq) { + entry = &max_load->freqs[i]; + freq = costs[i].freq; + hpct = get_freq_max_load(cpu, freq); + if (hpct <= 0 || hpct > 100) + hpct = 100; + hfreq = div64_u64((u64)freq * hpct, 100); + entry->hdemand = + div64_u64(max_demand * hfreq, + cpu_max_possible_freq(cpu)); + i++; + } + + rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); + if (old_max_load) + kfree_rcu(old_max_load, rcu); + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return 0; + +fail: + for_each_cpu(cpu, cpumask) { + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (max_load) { + rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); + kfree_rcu(max_load, rcu); + } + } + + spin_unlock_irqrestore(&freq_max_load_lock, flags); + return ret; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_online_cpu(i) { + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to 
cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. 
+ */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +struct list_head cluster_head; +static DEFINE_MUTEX(cluster_lock); +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +unsigned int max_power_cost = 1; + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .max_mitigated_freq = UINT_MAX, + .min_freq = 1, + .max_possible_freq = 1, + .dstate = 0, + .dstate_wakeup_energy = 0, + .dstate_wakeup_latency = 0, + .exec_scale_factor = 1024, + .notifier_sent = 0, + .wake_up_idle = 0, +}; + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + + pre_big_task_count_change(cpu_possible_mask); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + sched_update_freq_max_load(cpu_possible_mask); + post_big_task_count_change(cpu_possible_mask); +} + +static void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +static void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +static int +compare_clusters(void *priv, struct list_head *a, struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. 
+ */ + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +static void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + unsigned int tmp_max = 1; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + + if (cluster->max_power_cost > tmp_max) + tmp_max = cluster->max_power_cost; + } + max_power_cost = tmp_max; + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); +} + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + __WARN_printf("Cluster allocation failed. \ + Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->max_mitigated_freq = UINT_MAX; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->dstate = 0; + cluster->dstate_wakeup_energy = 0; + cluster->dstate_wakeup_latency = 0; + cluster->freq_init_done = false; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + cluster->notifier_sent = 0; + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
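The comparator used by sort_clusters() is easy to read backwards, so here is a small standalone sketch of the same ordering rule expressed for qsort(); the cluster values are invented.

#include <stdio.h>
#include <stdlib.h>

struct cl {
	int id;
	unsigned int max_power_cost;
	unsigned int max_possible_capacity;
};

/* Same rule as compare_clusters(): cheaper clusters sort first; on a
 * power-cost tie the higher-capacity cluster sorts first. */
static int cl_cmp(const void *a, const void *b)
{
	const struct cl *c1 = a, *c2 = b;

	if (c1->max_power_cost != c2->max_power_cost)
		return c1->max_power_cost < c2->max_power_cost ? -1 : 1;
	if (c1->max_possible_capacity != c2->max_possible_capacity)
		return c1->max_possible_capacity >
		       c2->max_possible_capacity ? -1 : 1;
	return 0;
}

int main(void)
{
	struct cl c[] = {
		{ 0, 300, 1024 },	/* big    */
		{ 1, 100,  512 },	/* little */
		{ 2, 300, 2048 },	/* prime, same power cost as big */
	};
	int i;

	qsort(c, 3, sizeof(c[0]), cl_cmp);
	for (i = 0; i < 3; i++)
		printf("%d ", c[i].id);	/* prints: 1 2 0 */
	printf("\n");
	return 0;
}

The net effect is that cluster_head runs from the cheapest cluster to the most expensive one, with capacity only breaking power-cost ties.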
+ */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); +} + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); +} + +int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + mutex_lock(&cluster_lock); + if (!cb->get_cpu_cycle_counter) { + mutex_unlock(&cluster_lock); + return -EINVAL; + } + + cpu_cycle_counter_cb = *cb; + use_cycle_counter = true; + mutex_unlock(&cluster_lock); + + return 0; +} + +/* Clear any HMP scheduler related requests pending from or on cpu */ +void clear_hmp_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + clear_boost_kick(cpu); + clear_reserved(cpu); + if (rq->push_task) { + struct task_struct *push_task = NULL; + + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->push_task) { + clear_reserved(rq->push_cpu); + push_task = rq->push_task; + rq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + if (push_task) + put_task_struct(push_task); + } +} + +int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) +{ + struct rq *rq = cpu_rq(cpu); + + rq->static_cpu_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cpu_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->static_cpu_pwr_cost; +} + +int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->static_cluster_pwr_cost = cost; + return 0; +} + +unsigned int sched_get_static_cluster_pwr_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; +} + +int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + + cluster->wake_up_idle = !!wake_idle; + return 0; +} + +unsigned int sched_get_cluster_wake_idle(int cpu) +{ + return cpu_rq(cpu)->cluster->wake_up_idle; +} + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +/* + * Tasks that are runnable continuously for a period greather than + * EARLY_DETECTION_DURATION can be flagged early as potential + * high load tasks. + */ +#define EARLY_DETECTION_DURATION 9500000 + +static __read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +static __read_mostly unsigned int sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; + +#define SCHED_ACCOUNT_WAIT_TIME 1 + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* + * Enable colocation and frequency aggregation for all threads in a process. + * The children inherits the group id from the parent. + */ +unsigned int __read_mostly sysctl_sched_enable_thread_grouping; + + +#define SCHED_NEW_TASK_WINDOWS 5 + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 + +/* + * This governs what load needs to be used when reporting CPU busy time + * to the cpufreq governor. 
+ */ +__read_mostly unsigned int sysctl_sched_freq_reporting_policy; + +/* + * For increase, send notification if + * freq_required - cur_freq > sysctl_sched_freq_inc_notify + */ +__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ + +/* + * For decrease, send notification if + * cur_freq - freq_required > sysctl_sched_freq_dec_notify + */ +__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ + +static __read_mostly unsigned int sched_io_is_busy; + +__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; + +/* Maximum allowed threshold before freq aggregation must be enabled */ +#define MAX_FREQ_AGGR_THRESH 1000 + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; +static LIST_HEAD(active_related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &active_related_thread_groups, list) + +/* + * Task load is categorized into buckets for the purpose of top task tracking. + * The entire range of load from 0 to sched_ravg_window needs to be covered + * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket + * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value + * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute + * sched_load_granule. + */ +__read_mostly unsigned int sched_load_granule = + MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES; + +/* Size of bitmaps maintained to track top tasks */ +static const unsigned int top_tasks_bitmap_size = + BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long); + +/* + * Demand aggregation for frequency purpose: + * + * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads + * for frequency determination purpose. This aggregation is done per-cluster. + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just rq->prev_runnable_sum. + * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. 
+ * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +static __read_mostly unsigned int sched_freq_aggregate = 1; +__read_mostly unsigned int sysctl_sched_freq_aggregate = 1; + +unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; +static unsigned int __read_mostly sched_freq_aggregate_threshold; + +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +/* A cpu can no longer accommodate more tasks if: + * + * rq->nr_running > sysctl_sched_spill_nr_run || + * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load + */ +unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; + +/* + * Place sync wakee tasks those have less than configured demand to the waker's + * cluster. + */ +unsigned int __read_mostly sched_small_wakee_task_load; +unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; + +unsigned int __read_mostly sched_big_waker_task_load; +unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; + +/* + * CPUs with load greater than the sched_spill_load_threshold are not + * eligible for task placement. When all CPUs in a cluster achieve a + * load higher than this level, tasks becomes eligible for inter + * cluster migration. + */ +unsigned int __read_mostly sched_spill_load; +unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; + +/* + * Prefer the waker CPU for sync wakee task, if the CPU has only 1 runnable + * task. This eliminates the LPM exit latency associated with the idle + * CPUs in the waker cluster. + */ +unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker; + +/* + * Tasks whose bandwidth consumption on a cpu is more than + * sched_upmigrate are considered "big" tasks. Big tasks will be + * considered for "up" migration, i.e migrating to a cpu with better + * capacity. + */ +unsigned int __read_mostly sched_upmigrate; +unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; + +/* + * Big tasks, once migrated, will need to drop their bandwidth + * consumption to less than sched_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_downmigrate; +unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; + +/* + * Task groups whose aggregate demand on a cpu is more than + * sched_group_upmigrate need to be up-migrated if possible. 
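The worked example in the comment above boils down to a few lines of arithmetic: pick the busiest CPU of the cluster by its own prev_runnable_sum and add the cluster-wide group contribution only there. The sketch below reproduces the same numbers; it is a simplification of the reporting behaviour the comment describes, not a copy of any one function in this file.

#include <stdio.h>

int main(void)
{
	/* Window M from the example: per-CPU non-group busy time and the
	 * per-CPU busy time of related-thread-group A, in ms. */
	unsigned long long cpu_prev[4] = { 1, 5, 0, 0 };
	unsigned long long grp_prev[4] = { 5, 6, 0, 0 };
	unsigned long long grp_total = 0, reported;
	int cpu, max_busy_cpu = 0;

	for (cpu = 0; cpu < 4; cpu++) {
		grp_total += grp_prev[cpu];
		if (cpu_prev[cpu] > cpu_prev[max_busy_cpu])
			max_busy_cpu = cpu;
	}

	for (cpu = 0; cpu < 4; cpu++) {
		reported = cpu_prev[cpu];
		if (cpu == max_busy_cpu)
			reported += grp_total;	/* aggregated group demand */
		/* prints 1, 16, 0, 0 - matching the comment above */
		printf("CPU%d busy time = %llums\n", cpu, reported);
	}
	return 0;
}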
+ */ +unsigned int __read_mostly sched_group_upmigrate; +unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100; + +/* + * Task groups, once up-migrated, will need to drop their aggregate + * demand to less than sched_group_downmigrate before they are "down" + * migrated. + */ +unsigned int __read_mostly sched_group_downmigrate; +unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95; + +/* + * The load scale factor of a CPU gets boosted when its max frequency + * is restricted due to which the tasks are migrating to higher capacity + * CPUs early. The sched_upmigrate threshold is auto-upgraded by + * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. + */ +unsigned int up_down_migrate_scale_factor = 1024; + +/* + * Scheduler selects and places task to its previous CPU if sleep time is + * less than sysctl_sched_select_prev_cpu_us. + */ +unsigned int __read_mostly +sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; + +unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; + +unsigned int __read_mostly +sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; + +unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; + +/* + * Scheduler tries to avoid waking up idle CPUs for tasks running + * in short bursts. If the task average burst is less than + * sysctl_sched_short_burst nanoseconds and it sleeps on an average + * for more than sysctl_sched_short_sleep nanoseconds, then the + * task is eligible for packing. + */ +unsigned int __read_mostly sysctl_sched_short_burst; +unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC; + +static void _update_up_down_migrate(unsigned int *up_migrate, + unsigned int *down_migrate, bool is_group) +{ + unsigned int delta; + + if (up_down_migrate_scale_factor == 1024) + return; + + delta = *up_migrate - *down_migrate; + + *up_migrate /= NSEC_PER_USEC; + *up_migrate *= up_down_migrate_scale_factor; + *up_migrate >>= 10; + *up_migrate *= NSEC_PER_USEC; + + if (!is_group) + *up_migrate = min(*up_migrate, sched_ravg_window); + + *down_migrate /= NSEC_PER_USEC; + *down_migrate *= up_down_migrate_scale_factor; + *down_migrate >>= 10; + *down_migrate *= NSEC_PER_USEC; + + *down_migrate = min(*down_migrate, *up_migrate - delta); +} + +static void update_up_down_migrate(void) +{ + unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); + unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate, false); + sched_upmigrate = up_migrate; + sched_downmigrate = down_migrate; + + up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct); + down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct); + + _update_up_down_migrate(&up_migrate, &down_migrate, true); + sched_group_upmigrate = up_migrate; + sched_group_downmigrate = down_migrate; +} + +void set_hmp_defaults(void) +{ + sched_spill_load = + pct_to_real(sysctl_sched_spill_load_pct); + + update_up_down_migrate(); + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * + NSEC_PER_USEC; + + sched_small_wakee_task_load = + div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_big_waker_task_load = + div64_u64((u64)sysctl_sched_big_waker_task_load_pct * + (u64)sched_ravg_window, 100); + + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); +} + 
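set_hmp_defaults() above only converts sysctl percentages into window-relative nanosecond values. The sketch below walks through that conversion and through _update_up_down_migrate() for the default 80%/60% thresholds; it assumes pct_to_real(x) is simply x percent of max_task_load() (the same formula used for sched_init_task_load_windows) and uses an invented up_down_migrate_scale_factor of 2048, which per the comment above corresponds to a lower-capacity CPU whose max_freq is restricted to half of its max_possible_freq.

#include <stdio.h>

#define NSEC_PER_USEC	1000ULL

static unsigned long long sched_ravg_window = 10000000ULL;	/* 10ms */

/* Assumption: pct_to_real(x) == x% of the window, in ns */
static unsigned long long pct_to_real(unsigned int pct)
{
	return sched_ravg_window * pct / 100;
}

int main(void)
{
	unsigned long long up = pct_to_real(80);	/* sched_upmigrate   */
	unsigned long long down = pct_to_real(60);	/* sched_downmigrate */
	unsigned long long delta = up - down;
	unsigned long long factor = 2048;	/* invented scale factor */

	/* Same steps as _update_up_down_migrate() for a task (not a group) */
	up = (up / NSEC_PER_USEC * factor >> 10) * NSEC_PER_USEC;
	if (up > sched_ravg_window)
		up = sched_ravg_window;

	down = (down / NSEC_PER_USEC * factor >> 10) * NSEC_PER_USEC;
	if (down > up - delta)
		down = up - delta;

	/* prints up=10000000 down=8000000: both thresholds are raised, but
	 * the gap between them (the hysteresis band) never shrinks. */
	printf("up=%llu down=%llu\n", up, down);
	return 0;
}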
+u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +#ifdef CONFIG_CGROUP_SCHED + +int upmigrate_discouraged(struct task_struct *p) +{ + return task_group(p)->upmigrate_discouraged; +} + +#else + +static inline int upmigrate_discouraged(struct task_struct *p) +{ + return 0; +} + +#endif + +/* Is a task "big" on its current cpu */ +static inline int __is_big_task(struct task_struct *p, u64 scaled_load) +{ + int nice = task_nice(p); + + if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) + return 0; + + return scaled_load > sched_upmigrate; +} + +int is_big_task(struct task_struct *p) +{ + return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); +} + +u64 cpu_load(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); +} + +u64 cpu_load_sync(int cpu, int sync) +{ + return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); +} + +/* + * Task will fit on a cpu if it's bandwidth consumption on that cpu + * will be less than sched_upmigrate. A big task that was previously + * "up" migrated will be considered fitting on "little" cpu if its + * bandwidth consumption on "little" cpu will be less than + * sched_downmigrate. This will help avoid frequenty migrations for + * tasks with load close to the upmigrate threshold + */ +int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_policy boost_policy) +{ + int upmigrate = sched_upmigrate; + + if (cpu_capacity(cpu) == max_capacity) + return 1; + + if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) + upmigrate = sched_downmigrate; + + if (boost_policy != SCHED_BOOST_ON_BIG) { + if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) + return 1; + + if (task_load < upmigrate) + return 1; + } else { + if (task_sched_boost(p) || task_load >= upmigrate) + return 0; + + return 1; + } + + return 0; +} + +int task_will_fit(struct task_struct *p, int cpu) +{ + u64 tload = scale_load_to_cpu(task_load(p), cpu); + + return task_load_will_fit(p, tload, cpu, sched_boost_policy()); +} + +static int +group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp, + u64 demand, bool group_boost) +{ + int cpu = cluster_first_cpu(cluster); + int prev_capacity = 0; + unsigned int threshold = sched_group_upmigrate; + u64 load; + + if (cluster->capacity == max_capacity) + return 1; + + if (group_boost) + return 0; + + if (!demand) + return 1; + + if (grp->preferred_cluster) + prev_capacity = grp->preferred_cluster->capacity; + + if (cluster->capacity < prev_capacity) + threshold = sched_group_downmigrate; + + load = scale_load_to_cpu(demand, cpu); + if (load < threshold) + return 1; + + return 0; +} + +/* + * Return the cost of running task p on CPU cpu. This function + * currently assumes that task p is the only task which will run on + * the CPU. 
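task_load_will_fit() above is where sched_upmigrate and sched_downmigrate combine into a hysteresis band. The stripped-down sketch below keeps only that decision; the nice-value and boost special cases are omitted, and capacities and thresholds are invented.

#include <stdio.h>

/* 80% / 60% of a 10ms window, in ns */
static unsigned long long sched_upmigrate = 8000000;
static unsigned long long sched_downmigrate = 6000000;

/*
 * Does a task with this (scaled) load fit on a CPU of capacity
 * 'dst_cap' when it currently sits on a CPU of capacity 'src_cap'?
 */
static int load_will_fit(unsigned long long load,
			 unsigned int src_cap, unsigned int dst_cap,
			 unsigned int max_cap)
{
	unsigned long long threshold = sched_upmigrate;

	if (dst_cap == max_cap)		/* biggest CPUs fit everything */
		return 1;
	if (src_cap > dst_cap)		/* moving down: use the lower bar */
		threshold = sched_downmigrate;

	return load < threshold;
}

int main(void)
{
	/* A 7ms task on a little CPU (512) does not yet qualify as big... */
	printf("%d\n", load_will_fit(7000000, 512, 512, 1024));	/* 1 */
	/* ...but once it crosses 8ms and moves to the big CPU (1024),
	 * it must drop below 6ms before the little CPU fits it again. */
	printf("%d\n", load_will_fit(7000000, 1024, 512, 1024));	/* 0 */
	printf("%d\n", load_will_fit(5000000, 1024, 512, 1024));	/* 1 */
	return 0;
}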
+ */ +unsigned int power_cost(int cpu, u64 demand) +{ + int first, mid, last; + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + struct cpu_pstate_pwr *costs; + struct freq_max_load *max_load; + int total_static_pwr_cost = 0; + struct rq *rq = cpu_rq(cpu); + unsigned int pc; + + if (!per_cpu_info || !per_cpu_info[cpu].ptable) + /* + * When power aware scheduling is not in use, or CPU + * power data is not available, just use the CPU + * capacity as a rough stand-in for real CPU power + * numbers, assuming bigger CPUs are more power + * hungry. + */ + return cpu_max_possible_capacity(cpu); + + rcu_read_lock(); + max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); + if (!max_load) { + pc = cpu_max_possible_capacity(cpu); + goto unlock; + } + + costs = per_cpu_info[cpu].ptable; + + if (demand <= max_load->freqs[0].hdemand) { + pc = costs[0].power; + goto unlock; + } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { + pc = costs[max_load->length - 1].power; + goto unlock; + } + + first = 0; + last = max_load->length - 1; + mid = (last - first) >> 1; + while (1) { + if (demand <= max_load->freqs[mid].hdemand) + last = mid; + else + first = mid; + + if (last - first == 1) + break; + mid = first + ((last - first) >> 1); + } + + pc = costs[last].power; + +unlock: + rcu_read_unlock(); + + if (idle_cpu(cpu) && rq->cstate) { + total_static_pwr_cost += rq->static_cpu_pwr_cost; + if (rq->cluster->dstate) + total_static_pwr_cost += + rq->cluster->static_cluster_pwr_cost; + } + + return pc + total_static_pwr_cost; + +} + +void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks++; +} + +void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + if (is_big_task(p)) + stats->nr_big_tasks--; + + BUG_ON(stats->nr_big_tasks < 0); +} + +void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) +{ + stats->nr_big_tasks = 0; + if (reset_cra) { + stats->cumulative_runnable_avg = 0; + stats->pred_demands_sum = 0; + } +} + +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp; + int rc = 1; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (grp) + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +struct sched_cluster *rq_cluster(struct rq *rq) +{ + return rq->cluster; +} + +/* + * reset_cpu_hmp_stats - reset HMP stats for a cpu + * nr_big_tasks + * cumulative_runnable_avg (iff reset_cra is true) + */ +void reset_cpu_hmp_stats(int cpu, int reset_cra) +{ + reset_cfs_rq_hmp_stats(cpu, reset_cra); + reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); +} + +void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta) +{ + u64 new_task_load; + u64 old_task_load; + + if (sched_disable_window_stats) + return; + + old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); + new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); + + if (__is_big_task(p, old_task_load) && 
!__is_big_task(p, new_task_load)) + stats->nr_big_tasks--; + else if (!__is_big_task(p, old_task_load) && + __is_big_task(p, new_task_load)) + stats->nr_big_tasks++; + + BUG_ON(stats->nr_big_tasks < 0); +} + +/* + * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. + */ +static void update_nr_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + /* Do not reset cumulative_runnable_avg */ + reset_cpu_hmp_stats(cpu, 0); + + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) + _inc_hmp_sched_stats_fair(rq, p, 0); +} + +/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ +void pre_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + local_irq_disable(); + + for_each_cpu(i, cpus) + raw_spin_lock(&cpu_rq(i)->lock); +} + +/* + * Reinitialize 'nr_big_tasks' counters on all affected cpus + */ +void post_big_task_count_change(const struct cpumask *cpus) +{ + int i; + + /* Assumes local_irq_disable() keeps online cpumap stable */ + for_each_cpu(i, cpus) + update_nr_big_tasks(i); + + for_each_cpu(i, cpus) + raw_spin_unlock(&cpu_rq(i)->lock); + + local_irq_enable(); +} + +DEFINE_MUTEX(policy_mutex); + +unsigned int update_freq_aggregate_threshold(unsigned int threshold) +{ + unsigned int old_threshold; + + mutex_lock(&policy_mutex); + + old_threshold = sysctl_sched_freq_aggregate_threshold_pct; + + sysctl_sched_freq_aggregate_threshold_pct = threshold; + sched_freq_aggregate_threshold = + pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); + + mutex_unlock(&policy_mutex); + + return old_threshold; +} + +static inline int invalid_value_freq_input(unsigned int *data) +{ + if (data == &sysctl_sched_freq_aggregate) + return !(*data == 0 || *data == 1); + + return 0; +} + +static inline int invalid_value(unsigned int *data) +{ + unsigned int val = *data; + + if (data == &sysctl_sched_ravg_hist_size) + return (val < 2 || val > RAVG_HIST_SIZE_MAX); + + if (data == &sysctl_sched_window_stats_policy) + return val >= WINDOW_STATS_INVALID_POLICY; + + return invalid_value_freq_input(data); +} + +/* + * Handle "atomic" update of sysctl_sched_window_stats_policy, + * sysctl_sched_ravg_hist_size variables. + */ +int sched_window_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + unsigned int old_val; + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write || (write && (old_val == *data))) + goto done; + + if (invalid_value(data)) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + reset_all_window_stats(0, 0); + +done: + mutex_unlock(&policy_mutex); + + return ret; +} + +/* + * Convert percentage value into absolute form. This will avoid div() operation + * in fast path, to convert task load in percentage scale. + */ +int sched_hmp_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int old_val; + unsigned int *data = (unsigned int *)table->data; + int update_task_count = 0; + + /* + * The policy mutex is acquired with cpu_hotplug.lock + * held from cpu_up()->cpufreq_governor_interactive()-> + * sched_set_window(). So enforce the same order here. 
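power_cost() earlier in this section selects a row of the per-CPU power table by bisecting on the hdemand thresholds. The standalone sketch below mirrors that search loop over an invented four-entry table.

#include <stdio.h>

#define NR_OPP	4

/* Invented frequency table: hdemand thresholds and power at each OPP */
static unsigned long long hdemand[NR_OPP] = { 1000, 2000, 4000, 8000 };
static unsigned int power[NR_OPP]         = {  100,  180,  320,  600 };

/* Same bisection as power_cost(): find the lowest OPP whose hdemand
 * covers the demand, and charge that OPP's power. */
static unsigned int demand_to_power(unsigned long long demand)
{
	int first = 0, last = NR_OPP - 1, mid;

	if (demand <= hdemand[0])
		return power[0];
	if (demand > hdemand[NR_OPP - 1])
		return power[NR_OPP - 1];

	mid = (last - first) >> 1;
	while (1) {
		if (demand <= hdemand[mid])
			last = mid;
		else
			first = mid;
		if (last - first == 1)
			break;
		mid = first + ((last - first) >> 1);
	}
	return power[last];
}

int main(void)
{
	/* prints 180 320 320 600 */
	printf("%u %u %u %u\n", demand_to_power(1500), demand_to_power(2500),
	       demand_to_power(4000), demand_to_power(5000));
	return 0;
}

Because the loop stops when last - first == 1 and returns costs[last], the result is the cheapest operating point whose hdemand still covers the requested demand.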
+ */ + if (write && (data == &sysctl_sched_upmigrate_pct)) { + update_task_count = 1; + get_online_cpus(); + } + + mutex_lock(&policy_mutex); + + old_val = *data; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (write && (old_val == *data)) + goto done; + + if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct || + sysctl_sched_group_downmigrate_pct > + sysctl_sched_group_upmigrate_pct) { + *data = old_val; + ret = -EINVAL; + goto done; + } + + /* + * Big task tunable change will need to re-classify tasks on + * runqueue as big and set their counters appropriately. + * sysctl interface affects secondary variables (*_pct), which is then + * "atomically" carried over to the primary variables. Atomic change + * includes taking runqueue lock of all online cpus and re-initiatizing + * their big counter values based on changed criteria. + */ + if (update_task_count) + pre_big_task_count_change(cpu_online_mask); + + set_hmp_defaults(); + + if (update_task_count) + post_big_task_count_change(cpu_online_mask); + +done: + mutex_unlock(&policy_mutex); + if (update_task_count) + put_online_cpus(); + return ret; +} + +inline int nr_big_tasks(struct rq *rq) +{ + return rq->hmp_stats.nr_big_tasks; +} + +unsigned int cpu_temp(int cpu) +{ + struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); + + if (per_cpu_info) + return per_cpu_info[cpu].temp; + else + return 0; +} + +/* + * kfree() may wakeup kswapd. So this function should NOT be called + * with any CPU's rq->lock acquired. + */ +void free_task_load_ptrs(struct task_struct *p) +{ + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + + /* + * update_task_ravg() can be called for exiting tasks. While the + * function itself ensures correct behavior, the corresponding + * trace event requires that these pointers be NULL. + */ + p->ravg.curr_window_cpu = NULL; + p->ravg.prev_window_cpu = NULL; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); + memset(&p->ravg, 0, sizeof(struct ravg)); + p->cpu_cycles = 0; + p->ravg.curr_burst = 0; + /* + * Initialize the avg_burst to twice the threshold, so that + * a task would not be classified as short burst right away + * after fork. It takes at least 6 sleep-wakeup cycles for + * the avg_burst to go below the threshold. + */ + p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; + p->ravg.avg_sleep_time = 0; + + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL); + + /* Don't have much choice. 
CPU frequency would be bogus */ + BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu); + + if (init_load_pct) + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + + p->ravg.demand = init_load_windows; + p->ravg.pred_demand = 0; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +/* Return task demand in percentage scale */ +unsigned int pct_task_load(struct task_struct *p) +{ + unsigned int load; + + load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); + + return load; +} + +/* + * Return total number of tasks "eligible" to run on highest capacity cpu + * + * This is simply nr_big_tasks for cpus which are not of max_capacity and + * nr_running for cpus of max_capacity + */ +unsigned int nr_eligible_big_tasks(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + int nr_big = rq->hmp_stats.nr_big_tasks; + int nr = rq->nr_running; + + if (!is_max_capacity_cpu(cpu)) + return nr_big; + + return nr; +} + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} + +early_param("sched_ravg_window", set_sched_ravg_window); + +static inline void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + u32 freq; + + freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); + delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); + delta *= rq->cluster->exec_scale_factor; + delta >>= 10; + + return delta; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +/* Does freq_required sufficiently exceed or fall behind cur_freq? */ +static inline int +nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) +{ + int delta = freq_required - cur_freq; + + if (freq_required > cur_freq) + return delta < sysctl_sched_freq_inc_notify; + + delta = -delta; + + return delta < sysctl_sched_freq_dec_notify; +} + +/* Convert busy time to frequency equivalent */ +static inline unsigned int load_to_freq(struct rq *rq, u64 load) +{ + unsigned int freq; + + load = scale_load_to_cpu(load, cpu_of(rq)); + load *= 128; + load = div64_u64(load, max_task_load()); + + freq = load * cpu_max_possible_freq(cpu_of(rq)); + freq /= 128; + + return freq; +} + +/* + * Return load from all related groups in given frequency domain. + */ +static void group_load_in_freq_domain(struct cpumask *cpus, + u64 *grp_load, u64 *new_grp_load) +{ + int j; + + for_each_cpu(j, cpus) { + struct rq *rq = cpu_rq(j); + + *grp_load += rq->grp_time.prev_runnable_sum; + *new_grp_load += rq->grp_time.nt_prev_runnable_sum; + } +} + +static inline u64 freq_policy_load(struct rq *rq, u64 load); +/* + * Should scheduler alert governor for changing frequency? 
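Every busy-time delta is normalized by scale_exec_time() before it lands in a window, so wall time spent on a slow or throttled CPU counts for less demand. A minimal sketch with invented numbers follows; it assumes cpu_cycles_to_freq() yields the current frequency in the same unit as max_possible_freq (kHz).

#include <stdio.h>

#define DIV64_U64_ROUNDUP(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned long long delta = 1000000;		/* 1ms of wall time, ns */
	unsigned long long cur_freq = 1200000;		/* kHz, from cycle counter */
	unsigned long long max_possible_freq = 2400000;	/* kHz */
	unsigned long long exec_scale_factor = 512;	/* cluster half as efficient */

	/* Same steps as scale_exec_time() */
	delta = DIV64_U64_ROUNDUP(delta * cur_freq, max_possible_freq);
	delta = delta * exec_scale_factor >> 10;

	/* prints 250000: 1ms at half speed on a half-efficiency cluster
	 * is charged as 0.25ms of demand. */
	printf("%llu\n", delta);
	return 0;
}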
+ * + * @check_pred - evaluate frequency based on the predictive demand + * @check_groups - add load from all related groups on given cpu + * + * check_groups is set to 1 if a "related" task movement/wakeup is triggering + * the notification check. To avoid "re-aggregation" of demand in such cases, + * we check whether the migrated/woken tasks demand (along with demand from + * existing tasks on the cpu) can be met on target cpu + * + */ + +static int send_notification(struct rq *rq, int check_pred, int check_groups) +{ + unsigned int cur_freq, freq_required; + unsigned long flags; + int rc = 0; + u64 group_load = 0, new_load = 0; + + if (check_pred) { + u64 prev = rq->old_busy_time; + u64 predicted = rq->hmp_stats.pred_demands_sum; + + if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) + return 0; + + prev = max(prev, rq->old_estimated_time); + if (prev > predicted) + return 0; + + cur_freq = load_to_freq(rq, prev); + freq_required = load_to_freq(rq, predicted); + + if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) + return 0; + } else { + /* + * Protect from concurrent update of rq->prev_runnable_sum and + * group cpu load + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (check_groups) + group_load = rq->grp_time.prev_runnable_sum; + + new_load = rq->prev_runnable_sum + group_load; + new_load = freq_policy_load(rq, new_load); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + cur_freq = load_to_freq(rq, rq->old_busy_time); + freq_required = load_to_freq(rq, new_load); + + if (nearly_same_freq(cur_freq, freq_required)) + return 0; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->cluster->notifier_sent) { + rq->cluster->notifier_sent = 1; + rc = 1; + trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, + new_load); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +/* Alert governor if there is a need to change frequency */ +void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) +{ + int cpu = cpu_of(rq); + + if (!send_notification(rq, check_pred, check_groups)) + return; + + atomic_notifier_call_chain( + &load_alert_notifier_head, 0, + (void *)(long)cpu); +} + +void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, + struct task_struct *p) +{ + bool check_groups; + + rcu_read_lock(); + check_groups = task_in_related_thread_group(p); + rcu_read_unlock(); + + if (!same_freq_domain(src_cpu, dest_cpu)) { + if (!src_cpu_dead) + check_for_freq_change(cpu_rq(src_cpu), false, + check_groups); + check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); + } else { + check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); + } +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; +} + +#define INC_STEP 8 +#define DEC_STEP 2 +#define CONSISTENT_THRES 16 +#define INC_STEP_BIG 16 +/* + * bucket_increase - update the count of all buckets + * + * @buckets: array of buckets tracking busy time of a task + * @idx: the index of bucket to be incremented + * + * Each time a complete window finishes, count of bucket that runtime + * falls in (@idx) is incremented. Counts of all other buckets are + * decayed. The rate of increase and decay could be different based + * on current count in the bucket. + */ +static inline void bucket_increase(u8 *buckets, int idx) +{ + int i, step; + + for (i = 0; i < NUM_BUSY_BUCKETS; i++) { + if (idx != i) { + if (buckets[i] > DEC_STEP) + buckets[i] -= DEC_STEP; + else + buckets[i] = 0; + } else { + step = buckets[i] >= CONSISTENT_THRES ? + INC_STEP_BIG : INC_STEP; + if (buckets[i] > U8_MAX - step) + buckets[i] = U8_MAX; + else + buckets[i] += step; + } + } +} + +static inline int busy_to_bucket(u32 normalized_rt) +{ + int bidx; + + bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); + bidx = min(bidx, NUM_BUSY_BUCKETS - 1); + + /* + * Combine lowest two buckets. The lowest frequency falls into + * 2nd bucket and thus keep predicting lowest bucket is not + * useful. + */ + if (!bidx) + bidx++; + + return bidx; +} + +static inline u64 +scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) +{ + return div64_u64(load * (u64)src_freq, (u64)dst_freq); +} + +/* + * get_pred_busy - calculate predicted demand for a task on runqueue + * + * @rq: runqueue of task p + * @p: task whose prediction is being updated + * @start: starting bucket. returned prediction should not be lower than + * this bucket. + * @runtime: runtime of the task. returned prediction should not be lower + * than this runtime. + * Note: @start can be derived from @runtime. It's passed in only to + * avoid duplicated calculation in some cases. + * + * A new predicted busy time is returned for task @p based on @runtime + * passed in. The function searches through buckets that represent busy + * time equal to or bigger than @runtime and attempts to find the bucket to + * to use for prediction. Once found, it searches through historical busy + * time and returns the latest that falls into the bucket. If no such busy + * time exists, it returns the medium of that bucket. 
+ */ +static u32 get_pred_busy(struct rq *rq, struct task_struct *p, + int start, u32 runtime) +{ + int i; + u8 *buckets = p->ravg.busy_buckets; + u32 *hist = p->ravg.sum_history; + u32 dmin, dmax; + u64 cur_freq_runtime = 0; + int first = NUM_BUSY_BUCKETS, final; + u32 ret = runtime; + + /* skip prediction for new tasks due to lack of history */ + if (unlikely(is_new_task(p))) + goto out; + + /* find minimal bucket index to pick */ + for (i = start; i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i]) { + first = i; + break; + } + } + /* if no higher buckets are filled, predict runtime */ + if (first >= NUM_BUSY_BUCKETS) + goto out; + + /* compute the bucket for prediction */ + final = first; + + /* determine demand range for the predicted bucket */ + if (final < 2) { + /* lowest two buckets are combined */ + dmin = 0; + final = 1; + } else { + dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); + } + dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); + + /* + * search through runtime history and return first runtime that falls + * into the range of predicted bucket. + */ + for (i = 0; i < sched_ravg_hist_size; i++) { + if (hist[i] >= dmin && hist[i] < dmax) { + ret = hist[i]; + break; + } + } + /* no historical runtime within bucket found, use average of the bin */ + if (ret < dmin) + ret = (dmin + dmax) / 2; + /* + * when updating in middle of a window, runtime could be higher + * than all recorded history. Always predict at least runtime. + */ + ret = max(runtime, ret); +out: + trace_sched_update_pred_demand(rq, p, runtime, + mult_frac((unsigned int)cur_freq_runtime, 100, + sched_ravg_window), ret); + return ret; +} + +static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) +{ + if (p->ravg.pred_demand >= p->ravg.curr_window) + return p->ravg.pred_demand; + + return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), + p->ravg.curr_window); +} + +/* + * predictive demand of a task is calculated at the window roll-over. + * if the task current window busy time exceeds the predicted + * demand, update it here to reflect the task needs. + */ +void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ + u32 new, old; + + if (is_idle_task(p) || exiting_task(p)) + return; + + if (event != PUT_PREV_TASK && event != TASK_UPDATE && + (!SCHED_FREQ_ACCOUNT_WAIT_TIME || + (event != TASK_MIGRATE && + event != PICK_NEXT_TASK))) + return; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) + return; + } + + new = calc_pred_demand(rq, p); + old = p->ravg.pred_demand; + + if (old >= new) + return; + + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, + p->ravg.demand, + new); + + p->ravg.pred_demand = new; +} + +void clear_top_tasks_bitmap(unsigned long *bitmap) +{ + memset(bitmap, 0, top_tasks_bitmap_size); + __set_bit(NUM_LOAD_INDICES, bitmap); +} + +/* + * Special case the last index and provide a fast path for index = 0. + * Note that sched_load_granule can change underneath us if we are not + * holding any runqueue locks while calling the two functions below. 
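get_pred_busy() above can be summarized as: find the first occupied bucket at or above the bucket of the current runtime, then return the most recent history sample that falls inside that bucket's demand range, or the middle of the range if none does. The sketch below models exactly that; NUM_BUSY_BUCKETS is assumed to be 10 and the window 10ms, both purely for illustration.

#include <stdio.h>

#define NUM_BUSY_BUCKETS	10
#define MAX_TASK_LOAD		10000000ULL	/* 10ms window, ns */
#define HIST_SIZE		5

static unsigned long long predict(const unsigned char *buckets,
				  const unsigned long long *hist,
				  int start, unsigned long long runtime)
{
	unsigned long long dmin, dmax, ret = runtime;
	int i, final = -1;

	for (i = start; i < NUM_BUSY_BUCKETS; i++) {
		if (buckets[i]) {
			final = i;
			break;
		}
	}
	if (final < 0)
		return runtime;		/* no history at this level or above */

	if (final < 2) {		/* two lowest buckets are combined */
		dmin = 0;
		final = 1;
	} else {
		dmin = final * MAX_TASK_LOAD / NUM_BUSY_BUCKETS;
	}
	dmax = (final + 1) * MAX_TASK_LOAD / NUM_BUSY_BUCKETS;

	/* hist[0] is the most recent window */
	for (i = 0; i < HIST_SIZE; i++) {
		if (hist[i] >= dmin && hist[i] < dmax) {
			ret = hist[i];
			break;
		}
	}
	if (ret < dmin)
		ret = (dmin + dmax) / 2;
	return ret > runtime ? ret : runtime;
}

int main(void)
{
	/* Task usually runs ~6-7ms, but the current window is only at 2ms */
	unsigned char buckets[NUM_BUSY_BUCKETS] = { 0, 0, 8, 0, 0, 0, 16, 0, 0, 0 };
	unsigned long long hist[HIST_SIZE] = { 2100000, 6500000, 6800000, 7000000, 2000000 };

	/* start bucket for 2ms is 2; prediction lands in bucket 2: 2100000 */
	printf("%llu\n", predict(buckets, hist, 2, 2000000));
	/* pretend bucket 2 were empty: prediction jumps to bucket 6: 6500000 */
	buckets[2] = 0;
	printf("%llu\n", predict(buckets, hist, 2, 2000000));
	return 0;
}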
+ */ +static u32 top_task_load(struct rq *rq) +{ + int index = rq->prev_top; + u8 prev = 1 - rq->curr_table; + + if (!index) { + int msb = NUM_LOAD_INDICES - 1; + + if (!test_bit(msb, rq->top_tasks_bitmap[prev])) + return 0; + else + return sched_load_granule; + } else if (index == NUM_LOAD_INDICES - 1) { + return sched_ravg_window; + } else { + return (index + 1) * sched_load_granule; + } +} + +static u32 load_to_index(u32 load) +{ + u32 index = load / sched_load_granule; + + return min(index, (u32)(NUM_LOAD_INDICES - 1)); +} + +static void update_top_tasks(struct task_struct *p, struct rq *rq, + u32 old_curr_window, int new_window, bool full_window) +{ + u8 curr = rq->curr_table; + u8 prev = 1 - curr; + u8 *curr_table = rq->top_tasks[curr]; + u8 *prev_table = rq->top_tasks[prev]; + int old_index, new_index, update_index; + u32 curr_window = p->ravg.curr_window; + u32 prev_window = p->ravg.prev_window; + bool zero_index_update; + + if (old_curr_window == curr_window && !new_window) + return; + + old_index = load_to_index(old_curr_window); + new_index = load_to_index(curr_window); + + if (!new_window) { + zero_index_update = !old_curr_window && curr_window; + if (old_index != new_index || zero_index_update) { + if (old_curr_window) + curr_table[old_index] -= 1; + if (curr_window) + curr_table[new_index] += 1; + if (new_index > rq->curr_top) + rq->curr_top = new_index; + } + + if (!curr_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + rq->top_tasks_bitmap[curr]); + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + rq->top_tasks_bitmap[curr]); + + return; + } + + /* + * The window has rolled over for this task. By the time we get + * here, curr/prev swaps would has already occurred. So we need + * to use prev_window for the new index. + */ + update_index = load_to_index(prev_window); + + if (full_window) { + /* + * Two cases here. Either 'p' ran for the entire window or + * it didn't run at all. In either case there is no entry + * in the prev table. If 'p' ran the entire window, we just + * need to create a new entry in the prev table. In this case + * update_index will be correspond to sched_ravg_window + * so we can unconditionally update the top index. 
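The top-task machinery above maps each task's window contribution to a small index and mirrors table occupancy into a bitmap stored most-loaded-bit-first, so the busiest index is a single find_first_bit() away. The sketch below shows that mapping with an assumed NUM_LOAD_INDICES of 1000 (giving a 10us sched_load_granule for the default 10ms window); the constant is defined outside this hunk, so treat the value as illustrative.

#include <stdio.h>

#define NUM_LOAD_INDICES	1000		/* illustrative */
#define SCHED_RAVG_WINDOW	10000000U	/* 10ms, ns */
#define LOAD_GRANULE		(SCHED_RAVG_WINDOW / NUM_LOAD_INDICES)

static unsigned int load_to_idx(unsigned int load)
{
	unsigned int index = load / LOAD_GRANULE;

	return index < NUM_LOAD_INDICES - 1 ? index : NUM_LOAD_INDICES - 1;
}

int main(void)
{
	/* Three runnable tasks with different window contributions */
	unsigned int loads[] = { 150000, 4200000, 9000000 };
	/* one extra slot for the sentinel bit at NUM_LOAD_INDICES */
	static unsigned char bitmap[NUM_LOAD_INDICES + 1];
	unsigned int i, bit;

	bitmap[NUM_LOAD_INDICES] = 1;	/* sentinel, as in clear_top_tasks_bitmap() */

	for (i = 0; i < 3; i++)
		bitmap[NUM_LOAD_INDICES - load_to_idx(loads[i]) - 1] = 1;

	/* find_first_bit() equivalent: the lowest set bit is the top task */
	for (bit = 0; !bitmap[bit]; bit++)
		;

	/* prints "top index 900": the 9ms task is the busiest */
	printf("top index %u\n", NUM_LOAD_INDICES - bit - 1);
	return 0;
}

With these numbers top_task_load() would report (900 + 1) * sched_load_granule, i.e. 9.01ms, for the busiest task.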
+ */ + if (prev_window) { + prev_table[update_index] += 1; + rq->prev_top = update_index; + } + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + rq->top_tasks_bitmap[prev]); + } else { + zero_index_update = !old_curr_window && prev_window; + if (old_index != update_index || zero_index_update) { + if (old_curr_window) + prev_table[old_index] -= 1; + + prev_table[update_index] += 1; + + if (update_index > rq->prev_top) + rq->prev_top = update_index; + + if (!prev_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + rq->top_tasks_bitmap[prev]); + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + rq->top_tasks_bitmap[prev]); + } + } + + if (curr_window) { + curr_table[new_index] += 1; + + if (new_index > rq->curr_top) + rq->curr_top = new_index; + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + rq->top_tasks_bitmap[curr]); + } +} + +static inline void clear_top_tasks_table(u8 *table) +{ + memset(table, 0, NUM_LOAD_INDICES * sizeof(u8)); +} + +static void rollover_top_tasks(struct rq *rq, bool full_window) +{ + u8 curr_table = rq->curr_table; + u8 prev_table = 1 - curr_table; + int curr_top = rq->curr_top; + + clear_top_tasks_table(rq->top_tasks[prev_table]); + clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]); + + if (full_window) { + curr_top = 0; + clear_top_tasks_table(rq->top_tasks[curr_table]); + clear_top_tasks_bitmap( + rq->top_tasks_bitmap[curr_table]); + } + + rq->curr_table = prev_table; + rq->prev_top = curr_top; + rq->curr_top = 0; +} + +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + u64 grp_curr_sum = rq->grp_time.curr_runnable_sum; + u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + grp_curr_sum = 0; + grp_nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + rq->grp_time.prev_runnable_sum = grp_curr_sum; + rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; + rq->grp_time.curr_runnable_sum = 0; + rq->grp_time.nt_curr_runnable_sum = 0; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + bool new_task; + 
struct related_thread_group *grp; + int cpu = rq->cpu; + u32 old_curr_window = p->ravg.curr_window; + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. + */ + if (!is_idle_task(p) && !exiting_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + rollover_top_tasks(rq, full_window); + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + + grp = p->grp; + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time = &rq->grp_time; + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. 
If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. */ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + if (!is_idle_task(p) && !exiting_task(p)) + update_top_tasks(p, rq, old_curr_window, + new_window, full_window); +} + +static inline u32 predict_and_update_buckets(struct rq *rq, + struct task_struct *p, u32 runtime) { + + int bidx; + u32 pred_demand; + + bidx = busy_to_bucket(runtime); + pred_demand = get_pred_busy(rq, p, bidx, runtime); + bucket_increase(p->ravg.busy_buckets, bidx); + + return pred_demand; +} + +#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC) + +/* + * Assumes rq_lock is held and wallclock was recorded in the same critical + * section as this function's invocation. 
+ */ +static inline u64 read_cycle_counter(int cpu, u64 wallclock) +{ + struct sched_cluster *cluster = cpu_rq(cpu)->cluster; + u64 delta; + + if (unlikely(!cluster)) + return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); + + /* + * Why don't we need locking here? Let's say that delta is negative + * because some other CPU happened to update last_cc_update with a + * more recent timestamp. We simply read the conter again in that case + * with no harmful side effects. This can happen if there is an FIQ + * between when we read the wallclock and when we use it here. + */ + delta = wallclock - atomic64_read(&cluster->last_cc_update); + if (delta > THRESH_CC_UPDATE) { + atomic64_set(&cluster->cycles, + cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu)); + atomic64_set(&cluster->last_cc_update, wallclock); + } + + return atomic64_read(&cluster->cycles); +} + +static void update_task_cpu_cycles(struct task_struct *p, int cpu, + u64 wallclock) +{ + if (use_cycle_counter) + p->cpu_cycles = read_cycle_counter(cpu, wallclock); +} + +static void +update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 cur_cycles; + int cpu = cpu_of(rq); + + lockdep_assert_held(&rq->lock); + + if (!use_cycle_counter) { + rq->cc.cycles = cpu_cur_freq(cpu); + rq->cc.time = 1; + return; + } + + cur_cycles = read_cycle_counter(cpu, wallclock); + + /* + * If current task is idle task and irqtime == 0 CPU was + * indeed idle and probably its cycle counter was not + * increasing. We still need estimatied CPU frequency + * for IO wait time accounting. Use the previously + * calculated frequency in such a case. + */ + if (!is_idle_task(rq->curr) || irqtime) { + if (unlikely(cur_cycles < p->cpu_cycles)) + rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); + else + rq->cc.cycles = cur_cycles - p->cpu_cycles; + rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; + + if (event == IRQ_UPDATE && is_idle_task(p)) + /* + * Time between mark_start of idle task and IRQ handler + * entry time is CPU cycle counter stall period. + * Upon IRQ handler entry sched_account_irqstart() + * replenishes idle task's cpu cycle counter so + * rq->cc.cycles now represents increased cycles during + * IRQ handler rather than time between idle entry and + * IRQ exit. Thus use irqtime as time delta. + */ + rq->cc.time = irqtime; + else + rq->cc.time = wallclock - p->ravg.mark_start; + BUG_ON((s64)rq->cc.time < 0); + } + + p->cpu_cycles = cur_cycles; + + trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, + rq->cc.time, p); +} + +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; + } + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). 
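update_task_rq_cpu_cycles() above turns two cycle-counter snapshots into the frequency estimate that scale_exec_time() consumes. The sketch below shows the arithmetic, assuming the counter ticks at the CPU's current frequency and that cpu_cycles_to_freq() is a plain cycles/time division; under those assumptions the NSEC_PER_MSEC factor makes the result come out in kHz.

#include <stdio.h>

#define NSEC_PER_MSEC	1000000ULL
#define U64_MAX		0xffffffffffffffffULL

int main(void)
{
	/* Counter snapshots taken 2ms apart while running at ~1.5GHz */
	unsigned long long prev_cycles = 1000000000ULL;
	unsigned long long cur_cycles = 1000000000ULL + 3000000ULL;
	unsigned long long time_ns = 2000000ULL;	/* wallclock - mark_start */
	unsigned long long cycles, freq_khz;

	/* Same wraparound handling as update_task_rq_cpu_cycles() */
	if (cur_cycles < prev_cycles)
		cycles = cur_cycles + (U64_MAX - prev_cycles);
	else
		cycles = cur_cycles - prev_cycles;
	cycles *= NSEC_PER_MSEC;

	/* Assumed cpu_cycles_to_freq(): cycles / time, here giving kHz */
	freq_khz = cycles / time_ns;

	/* prints 1500000 (kHz), i.e. 1.5GHz */
	printf("%llu\n", freq_khz);
	return 0;
}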
Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand, pred_demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + pred_demand = predict_and_update_buckets(rq, p, runtime); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + p->sched_class->fixup_hmp_sched_stats(rq, p, demand, + pred_demand); + + p->ravg.demand = demand; + p->ravg.pred_demand = pred_demand; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! 
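+ * Worked example for case (c), assuming a 20ms window purely for
+ * illustration: if mark_start lies 50ms before window_start and
+ * wallclock is 5ms past it, then nr_full_windows = 2 and
+ * ws_tmp = window_start - 40ms. The first 10ms (ws_tmp - ms) is added
+ * to p->ravg.sum, which is then recorded as one history sample; two
+ * further samples of the (scaled) 20ms window follow, and the trailing
+ * 5ms seeds p->ravg.sum for the new window.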
+ */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. + */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. + */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static inline void +update_task_burst(struct task_struct *p, struct rq *rq, int event, u64 runtime) +{ + /* + * update_task_demand() has checks for idle task and + * exit task. The runtime may include the wait time, + * so update the burst only for the cases where the + * task is running. + */ + if (event == PUT_PREV_TASK || (event == TASK_UPDATE && + rq->curr == p)) + p->ravg.curr_burst += runtime; +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 runtime; + + if (!rq->window_start || sched_disable_window_stats || + p->ravg.mark_start == wallclock) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) { + update_task_cpu_cycles(p, cpu_of(rq), wallclock); + goto done; + } + + update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); + runtime = update_task_demand(p, rq, event, wallclock); + if (runtime) + update_task_burst(p, rq, event, runtime); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + update_task_pred_demand(rq, p, event); + + if (exiting_task(p)) + goto done; + + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, + rq->cc.cycles, rq->cc.time, + p->grp ? 
&rq->grp_time : NULL); + +done: + p->ravg.mark_start = wallclock; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + + if (!rq->window_start || sched_disable_window_stats) + return; + + if (is_idle_task(curr)) { + /* We're here without rq->lock held, IRQ disabled */ + raw_spin_lock(&rq->lock); + update_task_cpu_cycles(curr, cpu, sched_ktime_clock()); + raw_spin_unlock(&rq->lock); + } +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; + + if (exiting_task(p)) { + sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } + + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + + p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst; + + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = p->last_wake_ts = wallclock; + p->last_cpu_selected_ts = wallclock; + p->last_switch_out_ts = 0; + update_task_cpu_cycles(p, cpu_of(rq), wallclock); +} + +void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + + if (rq->window_start) + return; + + if (!sync_cpu_available) { + rq->window_start = sched_ktime_clock(); + sync_cpu_available = 1; + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = sync_rq->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +static void reset_all_task_stats(void) +{ + struct task_struct *g, *p; + + do_each_thread(g, p) { + reset_task_stats(p); + } while_each_thread(g, p); +} + +enum reset_reason_code { + WINDOW_CHANGE, + POLICY_CHANGE, + HIST_SIZE_CHANGE, + FREQ_AGGREGATE_CHANGE, +}; + +const char *sched_window_reset_reasons[] = { + "WINDOW_CHANGE", + "POLICY_CHANGE", + "HIST_SIZE_CHANGE", + "FREQ_AGGREGATE_CHANGE", +}; + +/* Called with IRQs enabled */ +void 
reset_all_window_stats(u64 window_start, unsigned int window_size) +{ + int cpu, i; + unsigned long flags; + u64 start_ts = sched_ktime_clock(); + int reason = WINDOW_CHANGE; + unsigned int old = 0, new = 0; + + local_irq_save(flags); + + read_lock(&tasklist_lock); + + read_lock(&related_thread_group_lock); + + /* Taking all runqueue locks prevents race with sched_exit(). */ + for_each_possible_cpu(cpu) + raw_spin_lock(&cpu_rq(cpu)->lock); + + sched_disable_window_stats = 1; + + reset_all_task_stats(); + + read_unlock(&tasklist_lock); + + if (window_size) { + sched_ravg_window = window_size * TICK_NSEC; + set_hmp_defaults(); + sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES; + } + + sched_disable_window_stats = 0; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (window_start) + rq->window_start = window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + memset(&rq->grp_time, 0, sizeof(struct group_cpu_time)); + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + memset(&rq->load_subs[i], 0, + sizeof(struct load_subtractions)); + clear_top_tasks_table(rq->top_tasks[i]); + clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]); + } + + rq->curr_table = 0; + rq->curr_top = 0; + rq->prev_top = 0; + reset_cpu_hmp_stats(cpu, 1); + } + + if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { + reason = POLICY_CHANGE; + old = sched_window_stats_policy; + new = sysctl_sched_window_stats_policy; + sched_window_stats_policy = sysctl_sched_window_stats_policy; + } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { + reason = HIST_SIZE_CHANGE; + old = sched_ravg_hist_size; + new = sysctl_sched_ravg_hist_size; + sched_ravg_hist_size = sysctl_sched_ravg_hist_size; + } else if (sched_freq_aggregate != + sysctl_sched_freq_aggregate) { + reason = FREQ_AGGREGATE_CHANGE; + old = sched_freq_aggregate; + new = sysctl_sched_freq_aggregate; + sched_freq_aggregate = sysctl_sched_freq_aggregate; + } + + for_each_possible_cpu(cpu) + raw_spin_unlock(&cpu_rq(cpu)->lock); + + read_unlock(&related_thread_group_lock); + + local_irq_restore(flags); + + trace_sched_reset_all_window_stats(window_start, window_size, + sched_ktime_clock() - start_ts, reason, old, new); +} + +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. 
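+ * The subtraction entries themselves are filled in by
+ * update_cluster_load_subtractions() when a migrating task's window
+ * contributions have to be removed from sibling CPUs of its old
+ * cluster; applying them lazily here, under this rq's own lock, means
+ * the migration path does not have to take every sibling rq's lock.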
+ */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + +static inline u64 freq_policy_load(struct rq *rq, u64 load) +{ + unsigned int reporting_policy = sysctl_sched_freq_reporting_policy; + + switch (reporting_policy) { + case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK: + load = max_t(u64, load, top_task_load(rq)); + break; + case FREQ_REPORT_TOP_TASK: + load = top_task_load(rq); + break; + case FREQ_REPORT_CPU_LOAD: + break; + default: + break; + } + + return load; +} + +void sched_get_cpus_busy(struct sched_load *busy, + const struct cpumask *query_cpus) +{ + unsigned long flags; + struct rq *rq; + const int cpus = cpumask_weight(query_cpus); + u64 load[cpus], group_load[cpus]; + u64 nload[cpus], ngload[cpus]; + u64 pload[cpus]; + unsigned int max_freq[cpus]; + int notifier_sent = 0; + int early_detection[cpus]; + int cpu, i = 0; + unsigned int window_size; + u64 max_prev_sum = 0; + int max_busy_cpu = cpumask_first(query_cpus); + u64 total_group_load = 0, total_ngload = 0; + bool aggregate_load = false; + struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus)); + + if (unlikely(cpus == 0)) + return; + + local_irq_save(flags); + + /* + * This function could be called in timer context, and the + * current task may have been executing for a long time. Ensure + * that the window stats are current by doing an update. + */ + + for_each_cpu(cpu, query_cpus) + raw_spin_lock(&cpu_rq(cpu)->lock); + + window_size = sched_ravg_window; + + /* + * We don't really need the cluster lock for this entire for loop + * block. However, there is no advantage in optimizing this as rq + * locks are held regardless and would prevent migration anyways + */ + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), + 0); + + /* + * Ensure that we don't report load for 'cpu' again via the + * cpufreq_update_util path in the window that started at + * rq->window_start + */ + rq->load_reported_window = rq->window_start; + + account_load_subtractions(rq); + load[i] = rq->prev_runnable_sum; + nload[i] = rq->nt_prev_runnable_sum; + pload[i] = rq->hmp_stats.pred_demands_sum; + rq->old_estimated_time = pload[i]; + + if (load[i] > max_prev_sum) { + max_prev_sum = load[i]; + max_busy_cpu = cpu; + } + + /* + * sched_get_cpus_busy() is called for all CPUs in a + * frequency domain. So the notifier_sent flag per + * cluster works even when a frequency domain spans + * more than 1 cluster. 
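+ * The first queried CPU of a cluster therefore consumes and clears the
+ * flag; later CPUs of the same cluster in this query see it already
+ * cleared.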
+ */ + if (rq->cluster->notifier_sent) { + notifier_sent = 1; + rq->cluster->notifier_sent = 0; + } + early_detection[i] = (rq->ed_task != NULL); + max_freq[i] = cpu_max_freq(cpu); + i++; + } + + raw_spin_unlock(&cluster->load_lock); + + group_load_in_freq_domain( + &cpu_rq(max_busy_cpu)->freq_domain_cpumask, + &total_group_load, &total_ngload); + aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold); + + i = 0; + for_each_cpu(cpu, query_cpus) { + group_load[i] = 0; + ngload[i] = 0; + + if (early_detection[i]) + goto skip_early; + + rq = cpu_rq(cpu); + if (aggregate_load) { + if (cpu == max_busy_cpu) { + group_load[i] = total_group_load; + ngload[i] = total_ngload; + } + } else { + group_load[i] = rq->grp_time.prev_runnable_sum; + ngload[i] = rq->grp_time.nt_prev_runnable_sum; + } + + load[i] += group_load[i]; + nload[i] += ngload[i]; + + load[i] = freq_policy_load(rq, load[i]); + rq->old_busy_time = load[i]; + + /* + * Scale load in reference to cluster max_possible_freq. + * + * Note that scale_load_to_cpu() scales load in reference to + * the cluster max_freq. + */ + load[i] = scale_load_to_cpu(load[i], cpu); + nload[i] = scale_load_to_cpu(nload[i], cpu); + pload[i] = scale_load_to_cpu(pload[i], cpu); +skip_early: + i++; + } + + for_each_cpu(cpu, query_cpus) + raw_spin_unlock(&(cpu_rq(cpu))->lock); + + local_irq_restore(flags); + + i = 0; + for_each_cpu(cpu, query_cpus) { + rq = cpu_rq(cpu); + + if (early_detection[i]) { + busy[i].prev_load = div64_u64(sched_ravg_window, + NSEC_PER_USEC); + busy[i].new_task_load = 0; + busy[i].predicted_load = 0; + goto exit_early; + } + + load[i] = scale_load_to_freq(load[i], max_freq[i], + cpu_max_possible_freq(cpu)); + nload[i] = scale_load_to_freq(nload[i], max_freq[i], + cpu_max_possible_freq(cpu)); + + pload[i] = scale_load_to_freq(pload[i], max_freq[i], + rq->cluster->max_possible_freq); + + busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); + busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); + busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); + +exit_early: + trace_sched_get_busy(cpu, busy[i].prev_load, + busy[i].new_task_load, + busy[i].predicted_load, + early_detection[i], + aggregate_load && + cpu == max_busy_cpu); + i++; + } +} + +void sched_set_io_is_busy(int val) +{ + sched_io_is_busy = val; +} + +int sched_set_window(u64 window_start, unsigned int window_size) +{ + u64 now, cur_jiffies, jiffy_ktime_ns; + s64 ws; + unsigned long flags; + + if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW) + return -EINVAL; + + mutex_lock(&policy_mutex); + + /* + * Get a consistent view of ktime, jiffies, and the time + * since the last jiffy (based on last_jiffies_update). + */ + local_irq_save(flags); + cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); + local_irq_restore(flags); + + /* translate window_start from jiffies to nanoseconds */ + ws = (window_start - cur_jiffies); /* jiffy difference */ + ws *= TICK_NSEC; + ws += jiffy_ktime_ns; + + /* + * Roll back calculated window start so that it is in + * the past (window stats must have a current window). 
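+ * Example, assuming HZ=100 (TICK_NSEC = 10ms) purely for illustration:
+ * a window_start 3 jiffies ahead of cur_jiffies yields
+ * ws = jiffy_ktime_ns + 30ms; with window_size = 2 jiffies (20ms) the
+ * loop below peels off 20ms at a time until ws is no longer in the
+ * future.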
+ */ + while (ws > now) + ws -= (window_size * TICK_NSEC); + + BUG_ON(sched_ktime_clock() < ws); + + reset_all_window_stats(ws, window_size); + + sched_update_freq_max_load(cpu_possible_mask); + + mutex_unlock(&policy_mutex); + + return 0; +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +static void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] = 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + +static int get_top_index(unsigned long *bitmap, unsigned long old_top) +{ + int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top); + + if (index == NUM_LOAD_INDICES) + return 0; + + return NUM_LOAD_INDICES - 1 - index; +} + +static void +migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) +{ + int index; + int top_index; + 
u32 curr_window = p->ravg.curr_window; + u32 prev_window = p->ravg.prev_window; + u8 src = src_rq->curr_table; + u8 dst = dst_rq->curr_table; + u8 *src_table; + u8 *dst_table; + + if (curr_window) { + src_table = src_rq->top_tasks[src]; + dst_table = dst_rq->top_tasks[dst]; + index = load_to_index(curr_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_rq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_rq->top_tasks_bitmap[dst]); + + if (index > dst_rq->curr_top) + dst_rq->curr_top = index; + + top_index = src_rq->curr_top; + if (index == top_index && !src_table[index]) + src_rq->curr_top = get_top_index( + src_rq->top_tasks_bitmap[src], top_index); + } + + if (prev_window) { + src = 1 - src; + dst = 1 - dst; + src_table = src_rq->top_tasks[src]; + dst_table = dst_rq->top_tasks[dst]; + index = load_to_index(prev_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_rq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_rq->top_tasks_bitmap[dst]); + + if (index > dst_rq->prev_top) + dst_rq->prev_top = index; + + top_index = src_rq->prev_top; + if (index == top_index && !src_table[index]) + src_rq->prev_top = get_top_index( + src_rq->top_tasks_bitmap[src], top_index); + } +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + bool new_task; + struct related_thread_group *grp; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + clear_ed_task(p, src_rq); + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + update_task_cpu_cycles(p, new_cpu, wallclock); + + new_task = is_new_task(p); + /* Protected by rq_lock */ + grp = p->grp; + + /* + * For frequency aggregation, we continue to do migration fixups + * even for intra cluster migrations. This is because, the aggregated + * load has to reported on a single CPU regardless. 
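+ * For grouped tasks the windows therefore move between the two CPUs'
+ * rq->grp_time counters below, while ungrouped tasks go through
+ * inter_cluster_migration_fixup(), which is a no-op when source and
+ * destination share a frequency domain.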
+ */ + if (grp && sched_freq_aggregate) { + struct group_cpu_time *cpu_time; + + cpu_time = &src_rq->grp_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + cpu_time = &dest_rq->grp_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window; + *dst_nt_curr_runnable_sum += + p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= + p->ravg.prev_window; + *dst_nt_prev_runnable_sum += + p->ravg.prev_window; + } + } + } else { + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); + } + + migrate_top_tasks(p, src_rq, dest_rq); + + if (!same_freq_domain(new_cpu, task_cpu(p))) { + cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + } + + if (p == src_rq->ed_task) { + src_rq->ed_task = NULL; + if (!dest_rq->ed_task) + dest_rq->ed_task = p; + } + +done: + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +#define sched_up_down_migrate_auto_update 1 +static void check_for_up_down_migrate_update(const struct cpumask *cpus) +{ + int i = cpumask_first(cpus); + + if (!sched_up_down_migrate_auto_update) + return; + + if (cpu_max_possible_capacity(i) == max_possible_capacity) + return; + + if (cpu_max_possible_freq(i) == cpu_max_freq(i)) + up_down_migrate_scale_factor = 1024; + else + up_down_migrate_scale_factor = (1024 * + cpu_max_possible_freq(i)) / cpu_max_freq(i); + + update_up_down_migrate(); +} + +/* Return cluster which can offer required capacity for group */ +static struct sched_cluster *best_cluster(struct related_thread_group *grp, + u64 total_demand, bool group_boost) +{ + struct sched_cluster *cluster = NULL; + + for_each_sched_cluster(cluster) { + if (group_will_fit(cluster, grp, total_demand, group_boost)) + return cluster; + } + + return sched_cluster[0]; +} + +static void _set_preferred_cluster(struct related_thread_group *grp) +{ + struct task_struct *p; + u64 combined_demand = 0; + bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG; + bool group_boost = false; + u64 wallclock; + + if (list_empty(&grp->tasks)) + return; + + wallclock = sched_ktime_clock(); + + /* + * wakeup of two or more related tasks could race with each other and + * could result in multiple calls to _set_preferred_cluster being issued + * at same time. 
Avoid overhead in such cases of rechecking preferred + * cluster + */ + if (wallclock - grp->last_update < sched_ravg_window / 10) + return; + + list_for_each_entry(p, &grp->tasks, grp_list) { + if (boost_on_big && task_sched_boost(p)) { + group_boost = true; + break; + } + + if (p->ravg.mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + + combined_demand += p->ravg.demand; + + } + + grp->preferred_cluster = best_cluster(grp, + combined_demand, group_boost); + grp->last_update = sched_ktime_clock(); + trace_sched_set_preferred_cluster(grp, combined_demand); +} + +void set_preferred_cluster(struct related_thread_group *grp) +{ + raw_spin_lock(&grp->lock); + _set_preferred_cluster(grp); + raw_spin_unlock(&grp->lock); +} + +#define ADD_TASK 0 +#define REM_TASK 1 + +#define DEFAULT_CGROUP_COLOC_ID 1 + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + int cpu = cpu_of(rq); + bool new_task; + int i; + + if (!sched_freq_aggregate) + return; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + new_task = is_new_task(p); + + cpu_time = &rq->grp_time; + if (event == ADD_TASK) { + migrate_type = RQ_TO_GROUP; + + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; + *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[cpu]; + *src_nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[cpu]; + } + + update_cluster_load_subtractions(p, cpu, + rq->window_start, new_task); + + } else { + migrate_type = GROUP_TO_RQ; + + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window; + *src_prev_runnable_sum -= p->ravg.prev_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + } + + /* + * Need to reset curr/prev windows for all CPUs, not just the + * ones in the same cluster. 
Since inter cluster migrations + * did not result in the appropriate book keeping, the values + * per CPU would be inaccurate. + */ + for_each_possible_cpu(i) { + p->ravg.curr_window_cpu[i] = 0; + p->ravg.prev_window_cpu[i] = 0; + } + } + + *dst_curr_runnable_sum += p->ravg.curr_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + /* + * When a task enter or exits a group, it's curr and prev windows are + * moved to a single CPU. This behavior might be sub-optimal in the + * exit case, however, it saves us the overhead of handling inter + * cluster migration fixups while the task is part of a related group. + */ + p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; + + trace_sched_migration_update_sum(p, migrate_type, rq); + + BUG_ON((s64)*src_curr_runnable_sum < 0); + BUG_ON((s64)*src_prev_runnable_sum < 0); + BUG_ON((s64)*src_nt_curr_runnable_sum < 0); + BUG_ON((s64)*src_nt_prev_runnable_sum < 0); +} + +static inline struct related_thread_group* +lookup_related_thread_group(unsigned int group_id) +{ + return related_thread_groups[group_id]; +} + +int alloc_related_thread_groups(void) +{ + int i, ret; + struct related_thread_group *grp; + + /* groupd_id = 0 is invalid as it's special id to remove group. */ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = kzalloc(sizeof(*grp), GFP_NOWAIT); + if (!grp) { + ret = -ENOMEM; + goto err; + } + + grp->id = i; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + related_thread_groups[i] = grp; + } + + return 0; + +err: + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = lookup_related_thread_group(i); + if (grp) { + kfree(grp); + related_thread_groups[i] = NULL; + } else { + break; + } + } + + return ret; +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq; + int empty_group = 1; + + raw_spin_lock(&grp->lock); + + rq = __task_rq_lock(p); + transfer_busy_time(rq, p->grp, p, REM_TASK); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + __task_rq_unlock(rq); + + if (!list_empty(&grp->tasks)) { + empty_group = 0; + _set_preferred_cluster(grp); + } + + raw_spin_unlock(&grp->lock); + + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) + /* + * We test whether grp->list is attached with list_empty() + * hence re-init the list after deletion. + */ + list_del_init(&grp->list); +} + +static int +add_task_to_group(struct task_struct *p, struct related_thread_group *grp) +{ + struct rq *rq; + + raw_spin_lock(&grp->lock); + + /* + * Change p->grp under rq->lock. 
Will prevent races with read-side + * reference of p->grp in various hot-paths + */ + rq = __task_rq_lock(p); + transfer_busy_time(rq, grp, p, ADD_TASK); + list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); + __task_rq_unlock(rq); + + _set_preferred_cluster(grp); + + raw_spin_unlock(&grp->lock); + + return 0; +} + +void add_new_task_to_grp(struct task_struct *new) +{ + unsigned long flags; + struct related_thread_group *grp; + struct task_struct *leader = new->group_leader; + unsigned int leader_grp_id = sched_get_group_id(leader); + + if (!sysctl_sched_enable_thread_grouping && + leader_grp_id != DEFAULT_CGROUP_COLOC_ID) + return; + + if (thread_group_leader(new)) + return; + + if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) { + if (!same_schedtune(new, leader)) + return; + } + + write_lock_irqsave(&related_thread_group_lock, flags); + + rcu_read_lock(); + grp = task_related_thread_group(leader); + rcu_read_unlock(); + + /* + * It's possible that someone already added the new task to the + * group. A leader's thread group is updated prior to calling + * this function. It's also possible that the leader has exited + * the group. In either case, there is nothing else to do. + */ + if (!grp || new->grp) { + write_unlock_irqrestore(&related_thread_group_lock, flags); + return; + } + + raw_spin_lock(&grp->lock); + + rcu_assign_pointer(new->grp, grp); + list_add(&new->grp_list, &grp->tasks); + + raw_spin_unlock(&grp->lock); + write_unlock_irqrestore(&related_thread_group_lock, flags); +} + +static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0; + unsigned long flags; + struct related_thread_group *grp = NULL; + + if (group_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + write_lock(&related_thread_group_lock); + + /* Switching from one group to another directly is not permitted */ + if ((current != p && p->flags & PF_EXITING) || + (!p->grp && !group_id) || + (p->grp && group_id)) + goto done; + + if (!group_id) { + remove_task_from_group(p); + goto done; + } + + grp = lookup_related_thread_group(group_id); + if (list_empty(&grp->list)) + list_add(&grp->list, &active_related_thread_groups); + + rc = add_task_to_group(p, grp); +done: + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return rc; +} + +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) + return -EINVAL; + + return __sched_set_group_id(p, group_id); +} + +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct related_thread_group *grp; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +/* + * We create a default colocation group at boot. There is no need to + * synchronize tasks between cgroups at creation time because the + * correct cgroup hierarchy is not available at boot. Therefore cgroup + * colocation is turned off by default even though the colocation group + * itself has been allocated. Furthermore this colocation group cannot + * be destroyted once it has been created. All of this has been as part + * of runtime optimizations. + * + * The job of synchronizing tasks to the colocation group is done when + * the colocation flag in the cgroup is turned on. 
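+ * That synchronization happens via sync_cgroup_colocation() below,
+ * which moves a task into (or out of) the DEFAULT_CGROUP_COLOC_ID
+ * group through __sched_set_group_id().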
+ */ +static int __init create_default_coloc_group(void) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &active_related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + + update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH); + return 0; +} +late_initcall(create_default_coloc_group); + +int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0; + + return __sched_set_group_id(p, grp_id); +} +#endif + +static void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + + cpumask_copy(&cpumask, cpus); + pre_big_task_count_change(cpu_possible_mask); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + /* 'cpus' can contain cpumask more than one cluster */ + check_for_up_down_migrate_update(&cluster->cpus); + } + + __update_min_max_capacity(); + + post_big_task_count_change(cpu_possible_mask); +} + +static DEFINE_SPINLOCK(cpu_freq_min_max_lock); +void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) +{ + struct cpumask cpumask; + struct sched_cluster *cluster; + int i, update_capacity = 0; + unsigned long flags; + + spin_lock_irqsave(&cpu_freq_min_max_lock, flags); + cpumask_copy(&cpumask, cpus); + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + update_capacity += (cluster->max_mitigated_freq != fmax); + cluster->max_mitigated_freq = fmax; + } + spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); + + if (update_capacity) + update_cpu_cluster_capacity(cpus); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, &policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != 
cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_cur_freq(cpu) == new_freq) + return 0; + + for_each_cpu(i, &policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return 0; +} + +static int pwr_stats_ready_notifier(struct notifier_block *nb, + unsigned long cpu, void *data) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + sched_update_freq_max_load(&mask); + + mutex_lock(&cluster_lock); + sort_clusters(); + mutex_unlock(&cluster_lock); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static struct notifier_block notifier_pwr_stats_ready = { + .notifier_call = pwr_stats_ready_notifier +}; + +int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) +{ + return -EINVAL; +} + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. 
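+ * Of the cpufreq notifiers registered above, the policy notifier
+ * refreshes per-cluster min/max/cur frequencies and capacities, while
+ * the transition notifier updates cluster->cur_freq after first forcing
+ * a window-stats update on every CPU of the affected cluster.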
+ */ +core_initcall(register_sched_callback); + +int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + u32 new_load = task_load(p); + + if (!grp) + return 0; + + /* + * Update if task's load has changed significantly or a complete window + * has passed since we last updated preference + */ + if (abs(new_load - old_load) > sched_ravg_window / 4 || + sched_ktime_clock() - grp->last_update > sched_ravg_window) + return 1; + + return 0; +} + +bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + struct task_struct *p; + int loop_max = 10; + + if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running) + return 0; + + rq->ed_task = NULL; + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { + if (!loop_max) + break; + + if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { + rq->ed_task = p; + return 1; + } + + loop_max--; + } + + return 0; +} + +void update_avg_burst(struct task_struct *p) +{ + update_avg(&p->ravg.avg_burst, p->ravg.curr_burst); + p->ravg.curr_burst = 0; +} + +void note_task_waking(struct task_struct *p, u64 wallclock) +{ + u64 sleep_time = wallclock - p->last_switch_out_ts; + + /* + * When a short burst and short sleeping task goes for a long + * sleep, the task's avg_sleep_time gets boosted. It will not + * come below short_sleep threshold for a lot of time and it + * results in incorrect packing. The idead behind tracking + * avg_sleep_time is to detect if a task is short sleeping + * or not. So limit the sleep time to twice the short sleep + * threshold. For regular long sleeping tasks, the avg_sleep_time + * would be higher than threshold, and packing happens correctly. + */ + sleep_time = min_t(u64, sleep_time, 2 * sysctl_sched_short_sleep); + update_avg(&p->ravg.avg_sleep_time, sleep_time); + + p->last_wake_ts = wallclock; +} + +#ifdef CONFIG_CGROUP_SCHED +u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->upmigrate_discouraged; +} + +int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage) +{ + struct task_group *tg = css_tg(css); + int discourage = upmigrate_discourage > 0; + + if (tg->upmigrate_discouraged == discourage) + return 0; + + /* + * Revisit big-task classification for tasks of this cgroup. It would + * have been efficient to walk tasks of just this cgroup in running + * state, but we don't have easy means to do that. Walk all tasks in + * running state on all cpus instead and re-visit their big task + * classification. 
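+ * The walk is done with hotplug held off (get_online_cpus()) and is
+ * bracketed by pre_big_task_count_change()/post_big_task_count_change(),
+ * which presumably rebuild the per-rq nr_big_tasks accounting against
+ * the new setting.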
+ */ + get_online_cpus(); + pre_big_task_count_change(cpu_online_mask); + + tg->upmigrate_discouraged = discourage; + + post_big_task_count_change(cpu_online_mask); + put_online_cpus(); + + return 0; +} +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 33d7003fa1b8..d562efb04775 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -80,6 +80,26 @@ static void update_curr_idle(struct rq *rq) { } +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p) +{ +} + +static void +dec_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p) +{ +} + +static void +fixup_hmp_sched_stats_idle(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ +} + +#endif + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -108,4 +128,9 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_idle, + .dec_hmp_sched_stats = dec_hmp_sched_stats_idle, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_idle, +#endif }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad0519e8c7f5..391ec29c71c0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,11 +5,12 @@ #include "sched.h" +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/irq_work.h> +#include <trace/events/sched.h> #include <linux/hrtimer.h> -#include "walt.h" #include "tune.h" int sched_rr_timeslice = RR_TIMESLICE; @@ -258,8 +259,12 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { - /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + /* + * Try to pull RT tasks here if we lower this rq's prio and cpu is not + * isolated + */ + return rq->rt.highest_prio.curr > prev->prio && + !cpu_isolated(cpu_of(rq)); } static inline int rt_overloaded(struct rq *rq) @@ -430,7 +435,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -476,8 +481,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -493,7 +498,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -510,7 +515,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1244,6 +1249,41 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct 
*p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) { @@ -1280,7 +1320,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1293,26 +1356,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1321,7 +1395,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. 
*/ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1334,31 +1408,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1374,8 +1448,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - walt_inc_cumulative_runnable_avg(rq, p); + enqueue_rt_entity(rt_se, flags); + inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1413,8 +1487,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); - walt_dec_cumulative_runnable_avg(rq, p); + dequeue_rt_entity(rt_se, flags); + dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); @@ -1469,6 +1543,40 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +#ifdef CONFIG_SCHED_HMP +static int +select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + int target; + + rcu_read_lock(); + target = find_lowest_rq(p); + if (target != -1) + cpu = target; + rcu_read_unlock(); + + return cpu; +} +#endif + +/* + * Return whether the task on the given cpu is currently non-preemptible + * while handling a potentially long softint, or if the task is likely + * to block preemptions soon because it is a ksoftirq thread that is + * handling slow softints. + */ +bool +task_may_not_preempt(struct task_struct *task, int cpu) +{ + __u32 softirqs = per_cpu(active_softirqs, cpu) | + __IRQ_STAT(cpu, __softirq_pending); + struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu); + + return ((softirqs & LONG_SOFTIRQ_MASK) && + (task == cpu_ksoftirqd || + task_thread_info(task)->preempt_count & SOFTIRQ_MASK)); +} + /* * Perform a schedtune dequeue and cancelation of boost timers if needed. * Should be called only with the rq->lock held. 
@@ -1501,6 +1609,11 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, { struct task_struct *curr; struct rq *rq; + bool may_not_preempt; + +#ifdef CONFIG_SCHED_HMP + return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); +#endif /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1512,7 +1625,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, curr = READ_ONCE(rq->curr); /* unlocked access */ /* - * If the current task on @p's runqueue is an RT task, then + * If the current task on @p's runqueue is a softirq task, + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -1533,17 +1656,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. */ - if (curr && unlikely(rt_task(curr)) && + may_not_preempt = task_may_not_preempt(curr, cpu); + if (may_not_preempt || + (unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + curr->prio <= p->prio))) { int target = find_lowest_rq(p); /* - * Don't bother moving it if the destination CPU is - * not running a lower priority task. + * If cpu is non-preemptible, prefer remote cpu + * even if it's running a higher-prio task. + * Otherwise: Don't bother moving it if the + * destination CPU is not running a lower priority task. */ if (target != -1 && - p->prio < cpu_rq(target)->rt.highest_prio.curr) + (may_not_preempt || + p->prio < cpu_rq(target)->rt.highest_prio.curr)) cpu = target; } rcu_read_unlock(); @@ -1752,6 +1880,109 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); +#ifdef CONFIG_SCHED_HMP + +static int find_lowest_rq_hmp(struct task_struct *task) +{ + struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask); + struct cpumask candidate_mask = CPU_MASK_NONE; + struct sched_cluster *cluster; + int best_cpu = -1; + int prev_cpu = task_cpu(task); + u64 cpu_load, min_load = ULLONG_MAX; + int i; + int restrict_cluster; + int boost_on_big; + int pack_task, wakeup_latency, least_wakeup_latency = INT_MAX; + + boost_on_big = sched_boost() == FULL_THROTTLE_BOOST && + sched_boost_policy() == SCHED_BOOST_ON_BIG; + + restrict_cluster = sysctl_sched_restrict_cluster_spill; + + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return best_cpu; + + if (task->nr_cpus_allowed == 1) + return best_cpu; /* No other targets possible */ + + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return best_cpu; /* No targets found */ + + pack_task = is_short_burst_task(task); + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. 
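+ * Clusters are tried in order; within each one, isolated and
+ * high-irqload CPUs are skipped and the CPU with the smallest
+ * (optionally cpu-scaled) cumulative_runnable_avg wins, with ties going
+ * to the previous CPU or to a CPU sharing its cache. With
+ * sysctl_sched_restrict_cluster_spill set, the search stops at the
+ * first cluster that produces a candidate.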
+ */ + +retry: + for_each_sched_cluster(cluster) { + if (boost_on_big && cluster->capacity != max_possible_capacity) + continue; + + cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); + cpumask_andnot(&candidate_mask, &candidate_mask, + cpu_isolated_mask); + /* + * When placement boost is active, if there is no eligible CPU + * in the highest capacity cluster, we fallback to the other + * clusters. So clear the CPUs of the traversed cluster from + * the lowest_mask. + */ + if (unlikely(boost_on_big)) + cpumask_andnot(lowest_mask, lowest_mask, + &cluster->cpus); + + if (cpumask_empty(&candidate_mask)) + continue; + + for_each_cpu(i, &candidate_mask) { + if (sched_cpu_high_irqload(i)) + continue; + + cpu_load = cpu_rq(i)->hmp_stats.cumulative_runnable_avg; + if (!restrict_cluster) + cpu_load = scale_load_to_cpu(cpu_load, i); + + if (pack_task) { + wakeup_latency = cpu_rq(i)->wakeup_latency; + + if (wakeup_latency > least_wakeup_latency) + continue; + + if (wakeup_latency < least_wakeup_latency) { + least_wakeup_latency = wakeup_latency; + min_load = cpu_load; + best_cpu = i; + continue; + } + } + + if (cpu_load < min_load || + (cpu_load == min_load && + (i == prev_cpu || (best_cpu != prev_cpu && + cpus_share_cache(prev_cpu, i))))) { + min_load = cpu_load; + best_cpu = i; + } + } + + if (restrict_cluster && best_cpu != -1) + break; + } + + if (unlikely(boost_on_big && best_cpu == -1)) { + boost_on_big = 0; + goto retry; + } + + return best_cpu; +} +#endif /* CONFIG_SCHED_HMP */ + static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; @@ -1759,6 +1990,10 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); +#ifdef CONFIG_SCHED_HMP + return find_lowest_rq_hmp(task); +#endif + /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) return -1; @@ -1975,11 +2210,13 @@ retry: goto retry; } + next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, lowest_rq->cpu); next_task->on_rq = TASK_ON_RQ_QUEUED; activate_task(lowest_rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_QUEUED; ret = 1; resched_curr(lowest_rq); @@ -2249,11 +2486,13 @@ static void pull_rt_task(struct rq *this_rq) resched = true; + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); p->on_rq = TASK_ON_RQ_QUEUED; activate_task(this_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; /* * We continue with the search, just in * case there's an even higher prio task @@ -2326,7 +2565,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. 
*/ - if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || + cpu_isolated(cpu_of(rq))) return; queue_pull_task(rq); @@ -2341,6 +2581,7 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } + #endif /* CONFIG_SMP */ /* @@ -2514,6 +2755,11 @@ const struct sched_class rt_sched_class = { .switched_to = switched_to_rt, .update_curr = update_curr_rt, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_rt, + .dec_hmp_sched_stats = dec_hmp_sched_stats_rt, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_rt, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d066a6870245..90cc450dff7e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,14 +28,13 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); + extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); -extern void check_for_migration(struct rq *rq, struct task_struct *p); #else static inline void update_cpu_load_active(struct rq *this_rq) { } -static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } #endif /* @@ -245,6 +244,10 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_SCHED_HMP + bool upmigrate_discouraged; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -350,12 +353,96 @@ static inline void set_task_rq_fair(struct sched_entity *se, #endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ +extern struct task_group *css_tg(struct cgroup_subsys_state *css); #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +#ifdef CONFIG_SCHED_HMP + +#define NUM_TRACKED_WINDOWS 2 +#define NUM_LOAD_INDICES 1000 + +struct hmp_sched_stats { + int nr_big_tasks; + u64 cumulative_runnable_avg; + u64 pred_demands_sum; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_mitigated_freq = thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; + int dstate, dstate_wakeup_latency, dstate_wakeup_energy; + unsigned int static_cluster_pwr_cost; + int notifier_sent; + bool wake_up_idle; + atomic64_t last_cc_update; + atomic64_t cycles; +}; + +extern unsigned long all_cluster_ids[]; + +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +struct related_thread_group { + int id; + raw_spinlock_t lock; + struct list_head tasks; + struct list_head list; + struct sched_cluster *preferred_cluster; + struct rcu_head rcu; + u64 last_update; +}; + +extern struct list_head cluster_head; +extern struct sched_cluster *sched_cluster[NR_CPUS]; + +struct 
cpu_cycle { + u64 cycles; + u64 time; +}; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +extern unsigned int sched_disable_window_stats; +#endif /* CONFIG_SCHED_HMP */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -424,11 +511,12 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ -#ifdef CONFIG_SCHED_WALT - u64 cumulative_runnable_avg; +#ifdef CONFIG_CFS_BANDWIDTH + +#ifdef CONFIG_SCHED_HMP + struct hmp_sched_stats hmp_stats; #endif -#ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; @@ -698,6 +786,38 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_HMP + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct hmp_sched_stats hmp_stats; + + int cstate, wakeup_latency, wakeup_energy; + u64 window_start; + u64 load_reported_window; + unsigned long hmp_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + unsigned int static_cpu_pwr_cost; + struct task_struct *ed_task; + struct cpu_cycle cc; + u64 old_busy_time, old_busy_time_group; + u64 old_estimated_time; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + struct group_cpu_time grp_time; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; + DECLARE_BITMAP_ARRAY(top_tasks_bitmap, + NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); + u8 *top_tasks[NUM_TRACKED_WINDOWS]; + u8 curr_table; + int prev_top; + int curr_top; +#endif + #ifdef CONFIG_SCHED_WALT u64 cumulative_runnable_avg; u64 window_start; @@ -711,7 +831,6 @@ struct rq { u64 cum_window_demand; #endif /* CONFIG_SCHED_WALT */ - #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -986,6 +1105,652 @@ static inline void sched_ttwu_pending(void) { } #include "stats.h" #include "auto_group.h" +enum sched_boost_policy { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; + +#ifdef CONFIG_SCHED_HMP + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define SCHED_UPMIGRATE_MIN_NICE 15 +#define EXITING_TASK_MARKER 0xdeaddead + +#define UP_MIGRATION 1 +#define DOWN_MIGRATION 2 +#define IRQLOAD_MIGRATION 3 + +extern struct mutex policy_mutex; +extern unsigned int sched_ravg_window; +extern unsigned int sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int pct_task_load(struct task_struct *p); +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; +extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int sched_init_task_load_windows; +extern unsigned int up_down_migrate_scale_factor; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; +extern struct sched_cluster init_cluster; +extern unsigned int __read_mostly sched_short_sleep_task_threshold; +extern unsigned int __read_mostly sched_long_cpu_selection_threshold; +extern unsigned int __read_mostly sched_big_waker_task_load; +extern unsigned int __read_mostly sched_small_wakee_task_load; +extern unsigned int __read_mostly sched_spill_load; 
+extern unsigned int __read_mostly sched_upmigrate; +extern unsigned int __read_mostly sched_downmigrate; +extern unsigned int __read_mostly sched_load_granule; + +extern void init_new_task_load(struct task_struct *p); +extern u64 sched_ktime_clock(void); +extern int got_boost_kick(void); +extern int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +extern bool early_detection_notify(struct rq *rq, u64 wallclock); +extern void clear_ed_task(struct task_struct *p, struct rq *rq); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void clear_boost_kick(int cpu); +extern void clear_hmp_request(int cpu); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +extern void update_cluster_topology(void); +extern void note_task_waking(struct task_struct *p, u64 wallclock); +extern void set_task_last_switch_out(struct task_struct *p, u64 wallclock); +extern void init_clusters(void); +extern void reset_cpu_hmp_stats(int cpu, int reset_cra); +extern unsigned int max_task_load(void); +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); +extern void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock); +extern unsigned int cpu_temp(int cpu); +extern unsigned int nr_eligible_big_tasks(int cpu); +extern int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load); +extern void set_preferred_cluster(struct related_thread_group *grp); +extern void add_new_task_to_grp(struct task_struct *new); +extern unsigned int update_freq_aggregate_threshold(unsigned int threshold); +extern void update_avg_burst(struct task_struct *p); +extern void update_avg(u64 *avg, u64 sample); + +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define RESTRAINED_BOOST 3 + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline int cpu_efficiency(int cpu) +{ + return cpu_rq(cpu)->cluster->efficiency; +} + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline unsigned int cpu_min_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->min_freq; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. 
+ */ + return min(cluster->max_mitigated_freq, cluster->max_freq); +} + +static inline unsigned int cpu_max_freq(int cpu) +{ + return cluster_max_freq(cpu_rq(cpu)->cluster); +} + +static inline unsigned int cpu_max_possible_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_freq; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster; +} + +static inline int cpu_max_power_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->max_power_cost; +} + +static inline int cpu_min_power_cost(int cpu) +{ + return cpu_rq(cpu)->cluster->min_power_cost; +} + +static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period) +{ + return div64_u64(cycles, period); +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +/* + * 'load' is in reference to "best cpu" at its best frequency. + * Scale that in reference to a given cpu, accounting for how bad it is + * in reference to "best cpu". + */ +static inline u64 scale_load_to_cpu(u64 task_load, int cpu) +{ + u64 lsf = cpu_load_scale_factor(cpu); + + if (lsf != 1024) { + task_load *= lsf; + task_load /= 1024; + } + + return task_load; +} + +static inline unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +static inline void +inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ + u32 task_load; + + if (sched_disable_window_stats) + return; + + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; + + stats->cumulative_runnable_avg += task_load; + stats->pred_demands_sum += p->ravg.pred_demand; +} + +static inline void +dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ + u32 task_load; + + if (sched_disable_window_stats) + return; + + task_load = sched_disable_window_stats ? 0 : p->ravg.demand; + + stats->cumulative_runnable_avg -= task_load; + + BUG_ON((s64)stats->cumulative_runnable_avg < 0); + + stats->pred_demands_sum -= p->ravg.pred_demand; + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +static inline void +fixup_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p, s64 task_load_delta, + s64 pred_demand_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg += task_load_delta; + BUG_ON((s64)stats->cumulative_runnable_avg < 0); + + stats->pred_demands_sum += pred_demand_delta; + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +#define pct_to_real(tunable) \ + (div64_u64((u64)tunable * (u64)max_task_load(), 100)) + +#define real_to_pct(tunable) \ + (div64_u64((u64)tunable * (u64)100, (u64)max_task_load())) + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return !!(rcu_access_pointer(p->grp) != NULL); +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return rcu_dereference(p->grp); +} + +#define PRED_DEMAND_DELTA ((s64)new_pred_demand - p->ravg.pred_demand) + +extern void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups); + +extern void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p); + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +#define BOOST_KICK 0 +#define CPU_RESERVED 1 + +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + /* Name boost_flags as hmp_flags? */ + return test_and_set_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->hmp_flags); +} + +static inline u64 cpu_cravg_sync(int cpu, int sync) +{ + struct rq *rq = cpu_rq(cpu); + u64 load; + + load = rq->hmp_stats.cumulative_runnable_avg; + + /* + * If load is being checked in a sync wakeup environment, + * we may want to discount the load of the currently running + * task. 
+ */ + if (sync && cpu == smp_processor_id()) { + if (load > rq->curr->ravg.demand) + load -= rq->curr->ravg.demand; + else + load = 0; + } + + return load; +} + +static inline bool is_short_burst_task(struct task_struct *p) +{ + return p->ravg.avg_burst < sysctl_sched_short_burst && + p->ravg.avg_sleep_time > sysctl_sched_short_sleep; +} + +extern void check_for_migration(struct rq *rq, struct task_struct *p); +extern void pre_big_task_count_change(const struct cpumask *cpus); +extern void post_big_task_count_change(const struct cpumask *cpus); +extern void set_hmp_defaults(void); +extern int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost); +extern unsigned int power_cost(int cpu, u64 demand); +extern void reset_all_window_stats(u64 window_start, unsigned int window_size); +extern int sched_boost(void); +extern int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, + enum sched_boost_policy boost_policy); +extern enum sched_boost_policy sched_boost_policy(void); +extern int task_will_fit(struct task_struct *p, int cpu); +extern u64 cpu_load(int cpu); +extern u64 cpu_load_sync(int cpu, int sync); +extern int preferred_cluster(struct sched_cluster *cluster, + struct task_struct *p); +extern void inc_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void dec_nr_big_task(struct hmp_sched_stats *stats, + struct task_struct *p); +extern void inc_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern void dec_rq_hmp_stats(struct rq *rq, + struct task_struct *p, int change_cra); +extern void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra); +extern int is_big_task(struct task_struct *p); +extern int upmigrate_discouraged(struct task_struct *p); +extern struct sched_cluster *rq_cluster(struct rq *rq); +extern int nr_big_tasks(struct rq *rq); +extern void fixup_nr_big_tasks(struct hmp_sched_stats *stats, + struct task_struct *p, s64 delta); +extern void reset_task_stats(struct task_struct *p); +extern void reset_cfs_rq_hmp_stats(int cpu, int reset_cra); +extern void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra); +extern u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 upmigrate_discourage); +extern void sched_boost_parse_dt(void); +extern void clear_top_tasks_bitmap(unsigned long *bitmap); + +#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE) +extern bool task_sched_boost(struct task_struct *p); +extern int sync_cgroup_colocation(struct task_struct *p, bool insert); +extern bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2); +extern void update_cgroup_boost_settings(void); +extern void restore_cgroup_boost_settings(void); + +#else +static inline bool +same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return true; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} + +static inline void update_cgroup_boost_settings(void) { } +static inline void restore_cgroup_boost_settings(void) { } +#endif + +extern int alloc_related_thread_groups(void); + +#else /* CONFIG_SCHED_HMP */ + +struct hmp_sched_stats; +struct related_thread_group; +struct sched_cluster; + +static inline enum sched_boost_policy sched_boost_policy(void) +{ + return SCHED_BOOST_NONE; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} 
+ +static inline int got_boost_kick(void) +{ + return 0; +} + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + return 0; +} + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { } +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void clear_boost_kick(int cpu) { } +static inline void clear_hmp_request(int cpu) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void init_clusters(void) {} +static inline void update_cluster_topology(void) { } +static inline void note_task_waking(struct task_struct *p, u64 wallclock) { } +static inline void set_task_last_switch_out(struct task_struct *p, + u64 wallclock) { } + +static inline int task_will_fit(struct task_struct *p, int cpu) +{ + return 1; +} + +static inline int select_best_cpu(struct task_struct *p, int target, + int reason, int sync) +{ + return 0; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline int sched_boost(void) +{ + return 0; +} + +static inline int is_big_task(struct task_struct *p) +{ + return 0; +} + +static inline int nr_big_tasks(struct rq *rq) +{ + return 0; +} + +static inline int is_cpu_throttling_imminent(int cpu) +{ + return 0; +} + +static inline int is_task_migration_throttled(struct task_struct *p) +{ + return 0; +} + +static inline unsigned int cpu_temp(int cpu) +{ + return 0; +} + +static inline void +inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) { } + +static inline void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { } + +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + return 1; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return NULL; +} + +static inline void init_new_task_load(struct task_struct *p) +{ +} + +static inline u64 scale_load_to_cpu(u64 load, int cpu) +{ + return load; +} + +static inline unsigned int nr_eligible_big_tasks(int cpu) +{ + return 0; +} + +static inline bool is_max_capacity_cpu(int cpu) { return true; } + +static inline int pct_task_load(struct task_struct *p) { return 0; } + +static inline int cpu_capacity(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } + +static inline void inc_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void dec_cumulative_runnable_avg(struct hmp_sched_stats *stats, + struct task_struct *p) +{ +} + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ +} + +static inline void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock) +{ +} + +static inline int sched_cpu_high_irqload(int cpu) { return 0; } + +static inline void set_preferred_cluster(struct related_thread_group *grp) { } + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return false; +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return NULL; +} + +static inline 
u32 task_load(struct task_struct *p) { return 0; } + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load) +{ + return 0; +} + +static inline void add_new_task_to_grp(struct task_struct *new) {} + +#define PRED_DEMAND_DELTA (0) + +static inline void +check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) { } + +static inline void notify_migration(int src_cpu, int dest_cpu, + bool src_cpu_dead, struct task_struct *p) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } +static inline void pre_big_task_count_change(void) { } +static inline void post_big_task_count_change(void) { } +static inline void set_hmp_defaults(void) { } + +static inline void clear_reserved(int cpu) { } +static inline void sched_boost_parse_dt(void) {} +static inline int alloc_related_thread_groups(void) { return 0; } + +#define trace_sched_cpu_load(...) +#define trace_sched_cpu_load_lb(...) +#define trace_sched_cpu_load_cgroup(...) +#define trace_sched_cpu_load_wakeup(...) + +static inline void update_avg_burst(struct task_struct *p) {} + +#endif /* CONFIG_SCHED_HMP */ + +/* + * Returns the rq capacity of any rq in a group. This does not play + * well with groups where rq capacity can change independently. + */ +#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group)) + #ifdef CONFIG_CGROUP_SCHED /* @@ -1032,7 +1797,6 @@ static inline struct task_group *task_group(struct task_struct *p) { return NULL; } - #endif /* CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1090,7 +1854,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; @@ -1186,6 +1950,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_NO_NOTIFIER 0x08 /* do not notify governor */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1240,19 +2005,41 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 -#define ENQUEUE_WAKEUP_NEW 0x20 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 +#define ENQUEUE_WAKEUP_NEW 0x40 #define RETRY_TASK ((void *)-1UL) @@ -1319,6 +2106,12 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif +#ifdef CONFIG_SCHED_HMP + void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p); + void (*dec_hmp_sched_stats)(struct rq *rq, struct task_struct *p); + void (*fixup_hmp_sched_stats)(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand); +#endif }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -1343,6 +2136,7 @@ extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +extern void nohz_balance_clear_nohz_mask(int cpu); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); @@ -1400,7 +2194,9 @@ static inline int idle_get_state_idx(struct rq *rq) } #endif +#ifdef CONFIG_SYSRQ_SCHED_DEBUG extern void sysrq_sched_debug_show(void); +#endif extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1428,6 +2224,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; + sched_update_nr_prod(cpu_of(rq), count, true); rq->nr_running = prev_nr + count; if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1454,6 +2251,7 @@ static inline void __add_nr_running(struct rq *rq, unsigned count) static inline void __sub_nr_running(struct rq *rq, unsigned count) { + sched_update_nr_prod(cpu_of(rq), count, false); rq->nr_running -= count; } @@ -1617,6 +2415,7 @@ static inline unsigned long __cpu_util(int cpu, int delta) util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, walt_ravg_window >> SCHED_LOAD_SHIFT); #endif + delta += util; if (delta < 0) return 0; @@ -1644,6 +2443,20 @@ static inline unsigned long cpu_util_freq(int cpu) #endif +#ifdef CONFIG_SCHED_HMP +/* + * HMP and EAS are orthogonal. Hopefully the compiler just elides out all code + * with the energy_aware() check, so that we don't even pay the comparison + * penalty at runtime. 
+ */ +#define energy_aware() false +#else +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); @@ -1884,6 +2697,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +/* + * task_may_not_preempt - check whether a task may not be preemptible soon + */ +extern bool task_may_not_preempt(struct task_struct *task, int cpu); + #else /* CONFIG_SMP */ /* @@ -1951,6 +2769,9 @@ enum rq_nohz_flag_bits { NOHZ_BALANCE_KICK, }; +#define NOHZ_KICK_ANY 0 +#define NOHZ_KICK_RESTRICT 1 + #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif @@ -2032,6 +2853,18 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; +#ifdef CONFIG_SCHED_HMP + /* + * Skip if we've already reported, but not if this is an inter-cluster + * migration + */ + if (!sched_disable_window_stats && + (rq->load_reported_window == rq->window_start) && + !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG)) + return; + rq->load_reported_window = rq->window_start; +#endif + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); if (data) data->func(data, rq_clock(rq), flags); diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100644 index 000000000000..f03ed685f102 --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,199 @@ +/* Copyright (c) 2012, 2015-2017, 2018 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * Scheduler hook for average runqueue determination + */ +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/hrtimer.h> +#include <linux/sched.h> +#include <linux/math64.h> + +#include "sched.h" +#include <trace/events/sched.h> + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. + * + * Obtains the average nr_running value since the last poll. 
+ * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg, + unsigned int *max_nr, unsigned int *big_max_nr) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 diff = curr_time - last_get_time; + u64 tmp_avg = 0, tmp_iowait = 0, tmp_big_avg = 0; + + *avg = 0; + *iowait_avg = 0; + *big_avg = 0; + *max_nr = 0; + *big_max_nr = 0; + + if (!diff) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_avg += per_cpu(nr_prod_sum, cpu); + tmp_avg += per_cpu(nr, cpu) * diff; + + tmp_big_avg += per_cpu(nr_big_prod_sum, cpu); + tmp_big_avg += nr_eligible_big_tasks(cpu) * diff; + + tmp_iowait += per_cpu(iowait_prod_sum, cpu); + tmp_iowait += nr_iowait_cpu(cpu) * diff; + + per_cpu(last_time, cpu) = curr_time; + + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + + if (*max_nr < per_cpu(nr_max, cpu)) + *max_nr = per_cpu(nr_max, cpu); + + if (is_max_capacity_cpu(cpu)) { + if (*big_max_nr < per_cpu(nr_max, cpu)) + *big_max_nr = per_cpu(nr_max, cpu); + } + + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + diff = curr_time - last_get_time; + last_get_time = curr_time; + + /* + * Any task running on BIG cluster and BIG tasks running on little + * cluster contributes to big_avg. Small or medium tasks can also + * run on BIG cluster when co-location and scheduler boost features + * are activated. We don't want these tasks to downmigrate to little + * cluster when BIG CPUs are available but isolated. Round up the + * average values so that core_ctl aggressively unisolate BIG CPUs. + */ + *avg = (int)DIV64_U64_ROUNDUP(tmp_avg, diff); + *big_avg = (int)DIV64_U64_ROUNDUP(tmp_big_avg, diff); + *iowait_avg = (int)DIV64_U64_ROUNDUP(tmp_iowait, diff); + + trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg, + *max_nr, *big_max_nr); + + BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0); + pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n", + __func__, *avg, *big_avg, *iowait_avg); +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 + +#ifdef CONFIG_SCHED_HMP +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue) { + u64 load; + + load = cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg; + load = scale_load_to_cpu(load, cpu); + + if (load * BUSY_LOAD_FACTOR > sched_ravg_window) + load_trigger = true; + } + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} +#else +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ +} +#endif + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. 
+ * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index a5567ccd8803..3278c81cefb1 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,5 +1,4 @@ #include "sched.h" -#include "walt.h" /* * stop-task scheduling class. @@ -19,6 +18,41 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_HMP + +static void +inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) +{ + inc_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) +{ + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} + +static void +fixup_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void +inc_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_hmp_sched_stats_stop(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_HMP */ + static void check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) { @@ -44,14 +78,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); - walt_inc_cumulative_runnable_avg(rq, p); + inc_hmp_sched_stats_stop(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); - walt_dec_cumulative_runnable_avg(rq, p); + dec_hmp_sched_stats_stop(rq, p); } static void yield_task_stop(struct rq *rq) @@ -138,4 +172,9 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_stop, + .dec_hmp_sched_stats = dec_hmp_sched_stats_stop, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_stop, +#endif }; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d444fc1a4d58..b84d13750604 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -121,6 +121,33 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; +#ifdef 
CONFIG_SCHED_HMP + /* Toggle ability to override sched boost enabled */ + bool sched_boost_no_override; + + /* + * Controls whether a cgroup is eligible for sched boost or not. This + * can temporariliy be disabled by the kernel based on the no_override + * flag above. + */ + bool sched_boost_enabled; + + /* + * This tracks the default value of sched_boost_enabled and is used + * restore the value following any temporary changes to that flag. + */ + bool sched_boost_enabled_backup; + + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on. + */ + bool colocate; + + /* Controls whether further updates are allowed to the colocate flag */ + bool colocate_update_disabled; +#endif + /* Performance Boost (B) region threshold params */ int perf_boost_idx; @@ -134,7 +161,7 @@ struct schedtune { static inline struct schedtune *css_st(struct cgroup_subsys_state *css) { - return css ? container_of(css, struct schedtune, css) : NULL; + return container_of(css, struct schedtune, css); } static inline struct schedtune *task_schedtune(struct task_struct *tsk) @@ -159,6 +186,13 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, +#ifdef CONFIG_SCHED_HMP + .sched_boost_no_override = false, + .sched_boost_enabled = true, + .sched_boost_enabled_backup = true, + .colocate = false, + .colocate_update_disabled = false, +#endif .perf_boost_idx = 0, .perf_constrain_idx = 0, .prefer_idle = 0, @@ -239,6 +273,121 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +#ifdef CONFIG_SCHED_HMP +static inline void init_sched_boost(struct schedtune *st) +{ + st->sched_boost_no_override = false; + st->sched_boost_enabled = true; + st->sched_boost_enabled_backup = st->sched_boost_enabled; + st->colocate = false; + st->colocate_update_disabled = false; +} + +bool same_schedtune(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return task_schedtune(tsk1) == task_schedtune(tsk2); +} + +void update_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + if (allocated_group[i]->sched_boost_no_override) + continue; + + allocated_group[i]->sched_boost_enabled = false; + } +} + +void restore_cgroup_boost_settings(void) +{ + int i; + + for (i = 0; i < BOOSTGROUPS_COUNT; i++) { + if (!allocated_group[i]) + break; + + allocated_group[i]->sched_boost_enabled = + allocated_group[i]->sched_boost_enabled_backup; + } +} + +bool task_sched_boost(struct task_struct *p) +{ + struct schedtune *st = task_schedtune(p); + + return st->sched_boost_enabled; +} + +static u64 +sched_boost_override_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_no_override; +} + +static int sched_boost_override_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 override) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_no_override = !!override; + + return 0; +} + +static u64 sched_boost_enabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->sched_boost_enabled; +} + +static int sched_boost_enabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 enable) +{ + struct schedtune *st = css_st(css); + + st->sched_boost_enabled = !!enable; + st->sched_boost_enabled_backup = 
st->sched_boost_enabled; + + return 0; +} + +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct schedtune *st = css_st(css); + + if (st->colocate_update_disabled) + return -EPERM; + + st->colocate = !!colocate; + st->colocate_update_disabled = true; + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_sched_boost(struct schedtune *st) { } + +#endif /* CONFIG_SCHED_HMP */ + static void schedtune_cpu_update(int cpu) { @@ -569,7 +718,7 @@ prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 prefer_idle) { struct schedtune *st = css_st(css); - st->prefer_idle = prefer_idle; + st->prefer_idle = !!prefer_idle; return 0; } @@ -619,6 +768,22 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, return 0; } +static void schedtune_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct schedtune *st; + bool colocate; + + cgroup_taskset_first(tset, &css); + st = css_st(css); + + colocate = st->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + static struct cftype files[] = { { .name = "boost", @@ -630,6 +795,23 @@ static struct cftype files[] = { .read_u64 = prefer_idle_read, .write_u64 = prefer_idle_write, }, +#ifdef CONFIG_SCHED_HMP + { + .name = "sched_boost_no_override", + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "sched_boost_enabled", + .read_u64 = sched_boost_enabled_read, + .write_u64 = sched_boost_enabled_write, + }, + { + .name = "colocate", + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif { } /* terminate */ }; @@ -683,6 +865,7 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) /* Initialize per CPUs boost group support */ st->idx = idx; + init_sched_boost(st); if (schedtune_boostgroup_init(st)) goto release; @@ -720,6 +903,7 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, + .attach = schedtune_attach, }; static inline void @@ -915,7 +1099,8 @@ schedtune_init(void) */ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); if (!sd) { - pr_info("schedtune: no energy model data\n"); + if (energy_aware()) + pr_warn("schedtune: no energy model data\n"); goto nodata; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 9a9203b15cde..a39f81c7e87a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,7 +175,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. 
*/ -static u32 seccomp_run_filters(struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -579,20 +579,10 @@ void secure_computing_strict(int this_syscall) BUG(); } #else -int __secure_computing(void) -{ - u32 phase1_result = seccomp_phase1(NULL); - - if (likely(phase1_result == SECCOMP_PHASE1_OK)) - return 0; - else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) - return -1; - else - return seccomp_phase2(phase1_result); -} #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) { u32 filter_ret, action; int data; @@ -624,10 +614,50 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) goto skip; case SECCOMP_RET_TRACE: - return filter_ret; /* Save the rest for phase 2. */ + /* We've been put in this state by the ptracer already. */ + if (recheck_after_trace) + return 0; + + /* ENOSYS these calls if there is no tracer attached. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, + task_pt_regs(current), + -ENOSYS, 0); + goto skip; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification, + * which could leave us with a potentially unmodified + * syscall that the tracer would have liked to have + * changed. Since the process is about to die, we just + * force the syscall to be skipped and let the signal + * kill the process and correctly handle any tracer exit + * notifications. + */ + if (fatal_signal_pending(current)) + goto skip; + /* Check if the tracer forced the syscall to be skipped. */ + this_syscall = syscall_get_nr(current, task_pt_regs(current)); + if (this_syscall < 0) + goto skip; + + /* + * Recheck the syscall, since it may have changed. This + * intentionally uses a NULL struct seccomp_data to force + * a reload of all registers. This does not goto skip since + * a skip would have already been reported. + */ + if (__seccomp_filter(this_syscall, NULL, true)) + return -1; + + return 0; case SECCOMP_RET_ALLOW: - return SECCOMP_PHASE1_OK; + return 0; case SECCOMP_RET_KILL: default: @@ -639,96 +669,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) skip: audit_seccomp(this_syscall, 0, action); - return SECCOMP_PHASE1_SKIP; + return -1; +} +#else +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) +{ + BUG(); } #endif -/** - * seccomp_phase1() - run fast path seccomp checks on the current syscall - * @arg sd: The seccomp_data or NULL - * - * This only reads pt_regs via the syscall_xyz helpers. The only change - * it will make to pt_regs is via syscall_set_return_value, and it will - * only do that if it returns SECCOMP_PHASE1_SKIP. - * - * If sd is provided, it will not read pt_regs at all. - * - * It may also call do_exit or force a signal; these actions must be - * safe. - * - * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should - * be processed normally. - * - * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be - * invoked. In this case, seccomp_phase1 will have set the return value - * using syscall_set_return_value. 
- * - * If it returns anything else, then the return value should be passed - * to seccomp_phase2 from a context in which ptrace hooks are safe. - */ -u32 seccomp_phase1(struct seccomp_data *sd) +int __secure_computing(const struct seccomp_data *sd) { int mode = current->seccomp.mode; - int this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + int this_syscall; if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) - return SECCOMP_PHASE1_OK; + return 0; + + this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ - return SECCOMP_PHASE1_OK; -#ifdef CONFIG_SECCOMP_FILTER + return 0; case SECCOMP_MODE_FILTER: - return __seccomp_phase1_filter(this_syscall, sd); -#endif + return __seccomp_filter(this_syscall, sd, false); default: BUG(); } } - -/** - * seccomp_phase2() - finish slow path seccomp work for the current syscall - * @phase1_result: The return value from seccomp_phase1() - * - * This must be called from a context in which ptrace hooks can be used. - * - * Returns 0 if the syscall should be processed or -1 to skip the syscall. - */ -int seccomp_phase2(u32 phase1_result) -{ - struct pt_regs *regs = task_pt_regs(current); - u32 action = phase1_result & SECCOMP_RET_ACTION; - int data = phase1_result & SECCOMP_RET_DATA; - - BUG_ON(action != SECCOMP_RET_TRACE); - - audit_seccomp(syscall_get_nr(current, regs), 0, action); - - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - return -1; - } - - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - do_exit(SIGSYS); - if (syscall_get_nr(current, regs) < 0) - return -1; /* Explicit request to skip. 
*/ - - return 0; -} #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..3f300d3fd1f6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/suspend.h> #include "smpboot.h" @@ -32,6 +33,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static void flush_smp_call_function_queue(bool warn_cpu_offline); +/* CPU mask indicating which CPUs to bring online during smp_init() */ +static bool have_boot_cpu_mask; +static cpumask_var_t boot_cpu_mask; static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -548,6 +552,19 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); +static int __init boot_cpus(char *str) +{ + alloc_bootmem_cpumask_var(&boot_cpu_mask); + if (cpulist_parse(str, boot_cpu_mask) < 0) { + pr_warn("SMP: Incorrect boot_cpus cpumask\n"); + return -EINVAL; + } + have_boot_cpu_mask = true; + return 0; +} + +early_param("boot_cpus", boot_cpus); + /* Setup number of possible processor ids */ int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); @@ -563,6 +580,21 @@ void __weak smp_announce(void) printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); } +/* Should the given CPU be booted during smp_init() ? */ +static inline bool boot_cpu(int cpu) +{ + if (!have_boot_cpu_mask) + return true; + + return cpumask_test_cpu(cpu, boot_cpu_mask); +} + +static inline void free_boot_cpu_mask(void) +{ + if (have_boot_cpu_mask) /* Allocated from boot_cpus() */ + free_bootmem_cpumask_var(boot_cpu_mask); +} + /* Called by boot processor to activate the rest. */ void __init smp_init(void) { @@ -574,10 +606,12 @@ void __init smp_init(void) for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) break; - if (!cpu_online(cpu)) + if (!cpu_online(cpu) && boot_cpu(cpu)) cpu_up(cpu); } + free_boot_cpu_mask(); + /* Any cleanup work */ smp_announce(); smp_cpus_done(setup_max_cpus); @@ -733,8 +767,9 @@ void wake_up_all_idle_cpus(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - - wake_up_if_idle(cpu); + if (suspend_freeze_state == FREEZE_STATE_ENTER || + !cpu_isolated(cpu)) + wake_up_if_idle(cpu); } preempt_enable(); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..979248f5e21a 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/kthread.h> #include <linux/smpboot.h> +#include <linux/kmemleak.h> #include "smpboot.h" @@ -121,7 +122,45 @@ static int smpboot_thread_fn(void *data) } if (kthread_should_park()) { + /* + * Serialize against wakeup. If we take the lock first, + * wakeup is skipped. If we run later, we observe, + * TASK_RUNNING update from wakeup path, before moving + * forward. This helps avoid the race, where wakeup + * observes TASK_INTERRUPTIBLE, and also observes + * the TASK_PARKED in kthread_parkme() before updating + * task state to TASK_RUNNING. In this case, kthread + * gets parked in TASK_RUNNING state. This results + * in panic later on in kthread_unpark(), as it sees + * KTHREAD_IS_PARKED flag set but fails to rebind the + * kthread, due to it being not in TASK_PARKED state. 
+ * + * Control thread Hotplug Thread + * + * kthread_park() + * set KTHREAD_SHOULD_PARK + * smpboot_thread_fn() + * set_current_state( + * TASK_INTERRUPTIBLE); + * kthread_parkme() + * + * wake_up_process() + * + * raw_spin_lock_irqsave(&p->pi_lock, flags); + * if (!(p->state & state)) + * goto out; + * + * __set_current_state( + * TASK_PARKED); + * + * if (p->on_rq && ttwu_remote(p, wake_flags)) + * ttwu_remote() + * p->state = TASK_RUNNING; + * schedule(); + */ + raw_spin_lock(¤t->pi_lock); __set_current_state(TASK_RUNNING); + raw_spin_unlock(¤t->pi_lock); preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); @@ -177,6 +216,8 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); if (!td) return -ENOMEM; + + kmemleak_not_leak(td); td->cpu = cpu; td->ht = ht; diff --git a/kernel/softirq.c b/kernel/softirq.c index 359be4f39986..d69b77fc7cc1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +/* + * active_softirqs -- per cpu, a mask of softirqs that are being handled, + * with the expectation that approximate answers are acceptable and therefore + * no synchronization. + */ +DEFINE_PER_CPU(__u32, active_softirqs); + const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" @@ -227,6 +234,16 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif +#define softirq_deferred_for_rt(pending) \ +({ \ + __u32 deferred = 0; \ + if (cpupri_check_rt()) { \ + deferred = pending & LONG_SOFTIRQ_MASK; \ + pending &= ~LONG_SOFTIRQ_MASK; \ + } \ + deferred; \ +}) + asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; @@ -234,6 +251,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) int max_restart = MAX_SOFTIRQ_RESTART; struct softirq_action *h; bool in_hardirq; + __u32 deferred; __u32 pending; int softirq_bit; @@ -245,14 +263,15 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(pending); account_irq_enter_time(current); - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); restart: /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); + set_softirq_pending(deferred); + __this_cpu_write(active_softirqs, pending); local_irq_enable(); @@ -282,18 +301,21 @@ restart: pending >>= softirq_bit; } + __this_cpu_write(active_softirqs, 0); rcu_bh_qs(); local_irq_disable(); pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(pending); + if (pending) { if (time_before(jiffies, end) && !need_resched() && --max_restart) goto restart; - - wakeup_softirqd(); } + if (pending | deferred) + wakeup_softirqd(); lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d357d3f2e78..fa543fd3a6db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -98,6 +98,10 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. 
*/ +#ifdef CONFIG_USB +int deny_new_usb __read_mostly = 0; +EXPORT_SYMBOL(deny_new_usb); +#endif extern int suid_dumpable; #ifdef CONFIG_COREDUMP extern int core_uses_pid; @@ -125,6 +129,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -133,6 +138,10 @@ static int one_hundred = 100; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_SCHED_HMP +static int one_thousand = 1000; +static int max_freq_reporting_policy = FREQ_REPORT_INVALID_POLICY - 1; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -288,62 +297,259 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_IRQSOFF_TRACER) { - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, + .procname = "preemptoff_tracing_threshold_ns", + .data = &sysctl_preemptoff_tracing_threshold_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "irqsoff_tracing_threshold_ns", + .data = &sysctl_irqsoff_tracing_threshold_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SCHED_HMP + { + .procname = "sched_freq_reporting_policy", + .data = &sysctl_sched_freq_reporting_policy, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_freq_reporting_policy, }, { - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, + .procname = "sched_freq_inc_notify", + .data = &sysctl_sched_freq_inc_notify, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { - .procname = "sched_sync_hint_enable", - .data = &sysctl_sched_sync_hint_enable, + .procname = "sched_freq_dec_notify", + .data = &sysctl_sched_freq_dec_notify, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_ravg_hist_size", + .data = &sysctl_sched_ravg_hist_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, + { + .procname = "sched_window_stats_policy", + .data = &sysctl_sched_window_stats_policy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, }, -#ifdef CONFIG_SCHED_WALT { - .procname = "sched_use_walt_cpu_util", - .data = &sysctl_sched_use_walt_cpu_util, + .procname = "sched_spill_load", + .data = &sysctl_sched_spill_load_pct, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, }, { - .procname = "sched_use_walt_task_util", - .data = 
&sysctl_sched_use_walt_task_util, + .procname = "sched_spill_nr_run", + .data = &sysctl_sched_spill_nr_run, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "sched_upmigrate", + .data = &sysctl_sched_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "sched_downmigrate", + .data = &sysctl_sched_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "sched_group_upmigrate", + .data = &sysctl_sched_group_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { + .procname = "sched_group_downmigrate", + .data = &sysctl_sched_group_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { + .procname = "sched_init_task_load", + .data = &sysctl_sched_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "sched_select_prev_cpu_us", + .data = &sysctl_sched_select_prev_cpu_us, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + }, + { + .procname = "sched_restrict_cluster_spill", + .data = &sysctl_sched_restrict_cluster_spill, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "sched_small_wakee_task_load", + .data = &sysctl_sched_small_wakee_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "sched_big_waker_task_load", + .data = &sysctl_sched_big_waker_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "sched_prefer_sync_wakee_to_waker", + .data = &sysctl_sched_prefer_sync_wakee_to_waker, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "sched_enable_thread_grouping", + .data = &sysctl_sched_enable_thread_grouping, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "sched_walt_init_task_load_pct", - .data = &sysctl_sched_walt_init_task_load_pct, + .procname = "sched_pred_alert_freq", + .data = &sysctl_sched_pred_alert_freq, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "sched_freq_aggregate", + .data = &sysctl_sched_freq_aggregate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_window_update_handler, + }, + { + .procname = "sched_freq_aggregate_threshold", + .data = &sysctl_sched_freq_aggregate_threshold_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_hmp_proc_update_handler, + .extra1 = &zero, + /* + * Special handling for sched_freq_aggregate_threshold_pct + * which can be greater than 100. 
Use 1000 as an upper bound + * value which works for all practical use cases. + */ + .extra2 = &one_thousand, + }, + { + .procname = "sched_boost", + .data = &sysctl_sched_boost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_boost_handler, + .extra1 = &zero, + .extra2 = &three, + }, + { + .procname = "sched_short_burst_ns", + .data = &sysctl_sched_short_burst, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .procname = "sched_walt_cpu_high_irqload", - .data = &sysctl_sched_walt_cpu_high_irqload, + .procname = "sched_short_sleep_ns", + .data = &sysctl_sched_short_sleep, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_SCHED_HMP */ +#ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, -#endif { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, @@ -881,6 +1087,17 @@ static struct ctl_table kern_table[] = { .extra2 = &two, }, #endif +#ifdef CONFIG_USB + { + .procname = "deny_new_usb", + .data = &deny_new_usb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { .procname = "ngroups_max", .data = &ngroups_max, @@ -1238,6 +1455,27 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) + { + .procname = "boot_reason", + .data = &boot_reason, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + + { + .procname = "cold_boot", + .data = &cold_boot, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { } }; @@ -1664,6 +1902,22 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif +#ifdef CONFIG_SWAP + { + .procname = "swap_ratio", + .data = &sysctl_swap_ratio, + .maxlen = sizeof(sysctl_swap_ratio), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + }, + { + .procname = "swap_ratio_enable", + .data = &sysctl_swap_ratio_enable, + .maxlen = sizeof(sysctl_swap_ratio_enable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + }, +#endif { } }; @@ -2153,15 +2407,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*negp) { - if (*lvalp > (unsigned long) INT_MAX + 1) - return -EINVAL; - *valp = -*lvalp; - } else { - if (*lvalp > (unsigned long) INT_MAX) - return -EINVAL; - *valp = *lvalp; - } + *valp = *negp ? 
-*lvalp : *lvalp; } else { int val = *valp; if (val < 0) { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 10a1d7dc9313..d7612fcba10a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -138,6 +138,8 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, + { CTL_INT, KERN_BOOT_REASON, "boot_reason" }, + { CTL_INT, KERN_COLD_BOOT, "cold_boot" }, {} }; @@ -253,6 +255,7 @@ static const struct bin_table bin_net_ipv4_conf_vars_table[] = { { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, + { CTL_INT, NET_IPV4_CONF_NF_IPV4_DEFRAG_SKIP, "nf_ipv4_defrag_skip" }, {} }; @@ -523,6 +526,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_PREFIX_ROUTE, "accept_ra_prefix_route" }, {} }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0737a50380d7..8acdd6ae532c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. 
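The taskstats hunk above resizes cgroupstats_cmd_get_policy to TASKSTATS_CMD_ATTR_MAX because, as the added comment says, the parser walks attribute types up to the family-wide maximum; a policy array sized by the smaller per-command maximum would be indexed past its end. A minimal userspace sketch of that sizing rule (the constants and values below are illustrative, not the kernel's):

#include <stdio.h>

/* Hypothetical attribute maxima for two commands of one netlink family. */
#define FAMILY_MAXATTR  4       /* family-wide maximum (like TASKSTATS_CMD_ATTR_MAX) */
#define CMD_B_MAXATTR   1       /* what command B defines on its own */

/*
 * The validator iterates 0..FAMILY_MAXATTR, so the per-command policy
 * table must cover that whole range.  Sizing it CMD_B_MAXATTR + 1 would
 * make the reads below run past the end of the array.
 */
static const int cmd_b_policy[FAMILY_MAXATTR + 1] = { [1] = 42 };

int main(void)
{
        for (int type = 0; type <= FAMILY_MAXATTR; type++)
                printf("attr type %d -> policy %d\n", type, cmd_b_policy[type]);
        return 0;
}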
+ */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 49eca0beed32..b9b881ebaff3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -9,6 +9,7 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o -obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o + +ccflags-y += -Idrivers/cpuidle diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 015d432bcb08..4171fee2d4ec 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,10 @@ #include <linux/workqueue.h> #include <linux/freezer.h> +#ifdef CONFIG_MSM_PM +#include "lpm-levels.h" +#endif + /** * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base @@ -53,6 +57,18 @@ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); +static void alarmtimer_triggered_func(void *p) +{ + struct rtc_device *rtc = rtcdev; + + if (!(rtc->irq_data & RTC_AF)) + return; + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); +} + +static struct rtc_task alarmtimer_rtc_task = { + .func = alarmtimer_triggered_func +}; /** * alarmtimer_get_rtcdev - Return selected rtcdevice * @@ -63,7 +79,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); struct rtc_device *alarmtimer_get_rtcdev(void) { unsigned long flags; - struct rtc_device *ret; + struct rtc_device *ret = NULL; spin_lock_irqsave(&rtcdev_lock, flags); ret = rtcdev; @@ -77,24 +93,36 @@ static int alarmtimer_rtc_add_device(struct device *dev, struct class_interface *class_intf) { unsigned long flags; + int err = 0; struct rtc_device *rtc = to_rtc_device(dev); - if (rtcdev) return -EBUSY; - if (!rtc->ops->set_alarm) return -1; - if (!device_may_wakeup(rtc->dev.parent)) - return -1; spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { + err = rtc_irq_register(rtc, &alarmtimer_rtc_task); + if (err) + goto rtc_irq_reg_err; rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); } + +rtc_irq_reg_err: spin_unlock_irqrestore(&rtcdev_lock, flags); - return 0; + return err; + +} + +static void alarmtimer_rtc_remove_device(struct device *dev, + struct class_interface *class_intf) +{ + if (rtcdev && dev == &rtcdev->dev) { + rtc_irq_unregister(rtcdev, &alarmtimer_rtc_task); + rtcdev = NULL; + } } static inline void alarmtimer_rtc_timer_init(void) @@ -104,6 +132,7 @@ static inline void alarmtimer_rtc_timer_init(void) static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, + .remove_dev = &alarmtimer_rtc_remove_device, }; static int alarmtimer_rtc_interface_setup(void) @@ -217,6 +246,68 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); * set an rtc timer to fire that far into the future, which * will wake us from suspend. 
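The alarmtimer changes above revolve around arming the RTC so that an alarm expiry can pull the machine out of suspend. Roughly the same behaviour is reachable from userspace through the *_ALARM clock ids; a hedged sketch using the stock timerfd API (error handling trimmed, CAP_WAKE_ALARM assumed):

#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        /*
         * CLOCK_BOOTTIME_ALARM counts like CLOCK_BOOTTIME but, much like
         * the in-kernel alarmtimers, programs the RTC so the expiry can
         * wake the system from suspend.
         */
        int fd = timerfd_create(CLOCK_BOOTTIME_ALARM, 0);
        struct itimerspec its = { .it_value = { .tv_sec = 30 } };
        uint64_t expirations;

        if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
                perror("timerfd");
                return 1;
        }
        /* Blocks ~30s; the interval may span a system suspend. */
        if (read(fd, &expirations, sizeof(expirations)) == (ssize_t)sizeof(expirations))
                printf("alarm fired %llu time(s)\n",
                       (unsigned long long)expirations);
        close(fd);
        return 0;
}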
*/ +#if defined(CONFIG_RTC_DRV_QPNP) && defined(CONFIG_MSM_PM) +static int alarmtimer_suspend(struct device *dev) +{ + struct rtc_time tm; + ktime_t min, now; + unsigned long flags; + struct rtc_device *rtc; + int i; + int ret = 0; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + freezer_delta = ktime_set(0, 0); + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->gettime()); + if (!min.tv64 || (delta.tv64 < min.tv64)) + min = delta; + } + if (min.tv64 == 0) + return 0; + + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } + + /* Setup a timer to fire that far in the future */ + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + if (poweron_alarm) { + uint64_t msec = 0; + + msec = ktime_to_ms(min); + lpm_suspend_wake_time(msec); + } else { + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + } + return ret; +} +#else static int alarmtimer_suspend(struct device *dev) { struct rtc_time tm; @@ -271,11 +362,30 @@ static int alarmtimer_suspend(struct device *dev) __pm_wakeup_event(ws, MSEC_PER_SEC); return ret; } +#endif +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + rtc_timer_cancel(rtc, &rtctimer); + + return 0; +} + #else static int alarmtimer_suspend(struct device *dev) { return 0; } + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} #endif static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) @@ -810,6 +920,7 @@ out: /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a20368e1a720..3b43159ab41f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -108,7 +108,7 @@ static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); -static void clocksource_select(void); +static void clocksource_select(bool force); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; @@ -422,7 +422,7 @@ static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); if (__clocksource_watchdog_kthread()) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -562,11 +562,12 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur, + bool force) { struct clocksource *cs; - if 
(!finished_booting || list_empty(&clocksource_list)) + if ((!finished_booting && !force) || list_empty(&clocksource_list)) return NULL; /* @@ -584,13 +585,13 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) return NULL; } -static void __clocksource_select(bool skipcur) +static void __clocksource_select(bool skipcur, bool force) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot, skipcur); + best = clocksource_find_best(oneshot, skipcur, force); if (!best) return; @@ -630,22 +631,40 @@ static void __clocksource_select(bool skipcur) * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. */ -static void clocksource_select(void) +static void clocksource_select(bool force) { - __clocksource_select(false); + return __clocksource_select(false, force); } static void clocksource_select_fallback(void) { - __clocksource_select(true); + __clocksource_select(true, false); } #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ -static inline void clocksource_select(void) { } + +static inline void clocksource_select(bool force) { } static inline void clocksource_select_fallback(void) { } #endif +/** + * clocksource_select_force - Force re-selection of the best clocksource + * among registered clocksources + * + * clocksource_select() can't select the best clocksource before + * calling clocksource_done_booting() and since clocksource_select() + * should be called with clocksource_mutex held, provide a new API + * can be called from other files to select best clockrouce irrespective + * of finished_booting flag. + */ +void clocksource_select_force(void) +{ + mutex_lock(&clocksource_mutex); + clocksource_select(true); + mutex_unlock(&clocksource_mutex); +} + /* * clocksource_done_booting - Called near the end of core bootup * @@ -662,7 +681,7 @@ static int __init clocksource_done_booting(void) * Run the watchdog first to eliminate unstable clock sources */ __clocksource_watchdog_kthread(); - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -751,6 +770,7 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); + /** * __clocksource_register_scale - Used to install new clocksources * @cs: clocksource to be registered @@ -772,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); - clocksource_select(); + clocksource_select(false); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; @@ -795,7 +815,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); - clocksource_select(); + clocksource_select(false); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } @@ -899,7 +919,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) - clocksource_select(); + clocksource_select(false); mutex_unlock(&clocksource_mutex); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1879ba35d8f5..79fadcad21ff 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -784,34 +784,6 @@ void hrtimers_resume(void) clock_was_set_delayed(); } -static inline void 
timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (timer->start_site) - return; - timer->start_site = __builtin_return_address(0); - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -#endif -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; -#endif -} - -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (likely(!timer_stats_active)) - return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, 0); -#endif -} - /* * Counterpart to lock_hrtimer_base above: */ @@ -888,8 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + timer->state |= HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -909,12 +880,9 @@ static void __remove_hrtimer(struct hrtimer *timer, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) - return; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -931,6 +899,13 @@ static void __remove_hrtimer(struct hrtimer *timer, if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); #endif + +out: + /* + * We need to preserve PINNED state here, otherwise we may end up + * migrating pinned hrtimers as well. + */ + timer->state = newstate | (timer->state & HRTIMER_STATE_PINNED); } /* @@ -939,9 +914,8 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - u8 state = timer->state; - - if (state & HRTIMER_STATE_ENQUEUED) { + if (hrtimer_is_queued(timer)) { + u8 state = timer->state; int reprogram; /* @@ -953,13 +927,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * rare case and less expensive than a smp call. 
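The hrtimer hunks above rest on one idiom: HRTIMER_STATE_PINNED rides along in timer->state, so it has to be carried across a dequeue and masked out when testing whether the timer is active. A small standalone sketch of that bit handling (the constants and helpers below are invented for illustration, not the kernel's):

#include <assert.h>
#include <stdint.h>

#define ST_INACTIVE     0x00u
#define ST_ENQUEUED     0x01u
#define ST_PINNED       0x02u   /* must survive enqueue/dequeue transitions */

/* Like __remove_hrtimer() above: install the new queueing state but keep
 * the pinned bit, so a later migration pass can still tell pinned timers
 * apart. */
static uint8_t remove_keep_pinned(uint8_t state, uint8_t newstate)
{
        return newstate | (state & ST_PINNED);
}

/* Like the hrtimer_active() change above: the pinned bit alone does not
 * make a timer active, so mask it out before comparing. */
static int is_active(uint8_t state)
{
        return (state & ~ST_PINNED) != ST_INACTIVE;
}

int main(void)
{
        uint8_t s = ST_ENQUEUED | ST_PINNED;

        s = remove_keep_pinned(s, ST_INACTIVE);
        assert(s == ST_PINNED);         /* dequeued, still marked pinned */
        assert(!is_active(s));          /* ...but no longer active       */
        return 0;
}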
*/ debug_deactivate(timer); - timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); if (!restart) state = HRTIMER_STATE_INACTIVE; __remove_hrtimer(timer, base, state, reprogram); + timer->state &= ~HRTIMER_STATE_PINNED; return 1; } return 0; @@ -1011,7 +985,9 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - timer_stats_hrtimer_set_start_info(timer); + /* Update pinned state */ + timer->state &= ~HRTIMER_STATE_PINNED; + timer->state |= (!!(mode & HRTIMER_MODE_PINNED)) << HRTIMER_PINNED_SHIFT; leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) @@ -1154,12 +1130,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); - -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif } /** @@ -1192,8 +1162,8 @@ bool hrtimer_active(const struct hrtimer *timer) cpu_base = READ_ONCE(timer->base->cpu_base); seq = raw_read_seqcount_begin(&cpu_base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + if (((timer->state & ~HRTIMER_STATE_PINNED) != + HRTIMER_STATE_INACTIVE) || cpu_base->running == timer) return true; } while (read_seqcount_retry(&cpu_base->seq, seq) || @@ -1243,7 +1213,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&cpu_base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - timer_stats_account_hrtimer(timer); fn = timer->function; /* @@ -1631,17 +1600,23 @@ static void init_hrtimers_cpu(int cpu) hrtimer_init_hres(cpu_base); } -#ifdef CONFIG_HOTPLUG_CPU - +#if defined(CONFIG_HOTPLUG_CPU) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, + bool remove_pinned) { struct hrtimer *timer; struct timerqueue_node *node; + struct timerqueue_head pinned; + int is_pinned; + bool is_hotplug = !cpu_online(old_base->cpu_base->cpu); + + timerqueue_init_head(&pinned); while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); - BUG_ON(hrtimer_callback_running(timer)); + if (is_hotplug) + BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* @@ -1650,6 +1625,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + + is_pinned = timer->state & HRTIMER_STATE_PINNED; + if (!remove_pinned && is_pinned) { + timerqueue_add(&pinned, &timer->node); + continue; + } + timer->base = new_base; /* * Enqueue the timers on the new cpu. 
This does not @@ -1661,17 +1643,23 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, */ enqueue_hrtimer(timer, new_base); } + + /* Re-queue pinned timers for non-hotplug usecase */ + while ((node = timerqueue_getnext(&pinned))) { + timer = container_of(node, struct hrtimer, node); + + timerqueue_del(&pinned, &timer->node); + enqueue_hrtimer(timer, old_base); + } } -static void migrate_hrtimers(int scpu) +static void __migrate_hrtimers(int scpu, bool remove_pinned) { struct hrtimer_cpu_base *old_base, *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); + local_irq_save(flags); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* @@ -1683,7 +1671,7 @@ static void migrate_hrtimers(int scpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + &new_base->clock_base[i], remove_pinned); } raw_spin_unlock(&old_base->lock); @@ -1691,7 +1679,20 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); - local_irq_enable(); + local_irq_restore(flags); +} + +static void migrate_hrtimers(int scpu) +{ + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + __migrate_hrtimers(scpu, true); +} + +void hrtimer_quiesce_cpu(void *cpup) +{ + __migrate_hrtimers(*(int *)cpup, false); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1799,15 +1800,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) @@ -1829,15 +1834,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. 
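The reworded kerneldoc above boils down to: an interruptible sleeper can come back early for reasons other than expiry, and the caller has to check for -EINTR. The closest userspace analogue, shown here as a rough sketch, is nanosleep(), which is never restarted once a signal handler has run and reports the time that was left:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static void on_alarm(int sig)
{
        (void)sig;              /* exists only to interrupt the sleep */
}

int main(void)
{
        struct timespec req = { .tv_sec = 5, .tv_nsec = 0 }, rem = { 0, 0 };
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_alarm;
        sigaction(SIGALRM, &sa, NULL);
        alarm(1);               /* the "explicit wakeup" arrives after ~1s */

        if (nanosleep(&req, &rem) == -1 && errno == EINTR)
                printf("woken early, %ld.%09ld s left\n",
                       (long)rem.tv_sec, rem.tv_nsec);
        else
                printf("slept the full interval\n");
        return 0;
}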
*/ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8fc68e60c795..0a4f0c09bcbf 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1250,10 +1250,11 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - unsigned long long now; + unsigned long long now = 0; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); + if (cpu_timer_sample_group(clock_idx, tsk, &now)) + return; if (oldval) { /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 382b159d8592..1986ab6e5943 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -70,6 +70,11 @@ struct clock_data { static struct hrtimer sched_clock_timer; static int irqtime = -1; +static int initialized; +static u64 suspend_ns; +static u64 suspend_cycles; +static u64 resume_cycles; + core_param(irqtime, irqtime, int, 0400); @@ -236,6 +241,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } +int sched_clock_initialized(void) +{ + return initialized; +} + void __init sched_clock_postinit(void) { /* @@ -254,6 +264,8 @@ void __init sched_clock_postinit(void) hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sched_clock_timer.function = sched_clock_poll; hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + + initialized = 1; } /* @@ -279,6 +291,11 @@ static int sched_clock_suspend(void) struct clock_read_data *rd = &cd.read_data[0]; update_sched_clock(); + + suspend_ns = rd->epoch_ns; + suspend_cycles = rd->epoch_cyc; + pr_info("suspend ns:%17llu suspend cycles:%17llu\n", + rd->epoch_ns, rd->epoch_cyc); hrtimer_cancel(&sched_clock_timer); rd->read_sched_clock = suspended_sched_clock_read; @@ -290,6 +307,8 @@ static void sched_clock_resume(void) struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); + resume_cycles = rd->epoch_cyc; + pr_info("resume cycles:%17llu\n", rd->epoch_cyc); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); rd->read_sched_clock = cd.actual_read_sched_clock; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4cd1145ec260..d675f8b06110 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,11 +19,13 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> +#include <linux/timer.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/perf_event.h> #include <linux/context_tracking.h> +#include <linux/rq_stats.h> #include <asm/irq_regs.h> @@ -31,6 +33,10 @@ #include <trace/events/timer.h> +struct rq_data rq_info; +struct workqueue_struct *rq_wq; +spinlock_t rq_lock; + /* * Per cpu nohz control structure */ @@ -41,6 +47,21 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); */ static ktime_t last_jiffies_update; +u64 jiffy_to_ktime_ns(u64 *now, u64 *jiffy_ktime_ns) +{ + u64 cur_jiffies; + unsigned long seq; + + do { + seq = read_seqbegin(&jiffies_lock); + *now = ktime_get_ns(); + *jiffy_ktime_ns = ktime_to_ns(last_jiffies_update); + cur_jiffies = get_jiffies_64(); + } while (read_seqretry(&jiffies_lock, seq)); + + return cur_jiffies; +} + struct tick_sched *tick_get_tick_sched(int cpu) { return 
&per_cpu(tick_cpu_sched, cpu); @@ -143,7 +164,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -430,7 +451,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -716,7 +737,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) update_cpu_load_nohz(); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ @@ -804,6 +825,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) now = tick_nohz_start_idle(ts); +#ifdef CONFIG_SMP + if (check_pending_deferrable_timers(cpu)) + raise_softirq_irqoff(TIMER_SOFTIRQ); +#endif + if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; @@ -1076,6 +1102,51 @@ void tick_irq_enter(void) * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS +static void update_rq_stats(void) +{ + unsigned long jiffy_gap = 0; + unsigned int rq_avg = 0; + unsigned long flags = 0; + + jiffy_gap = jiffies - rq_info.rq_poll_last_jiffy; + + if (jiffy_gap >= rq_info.rq_poll_jiffies) { + + spin_lock_irqsave(&rq_lock, flags); + + if (!rq_info.rq_avg) + rq_info.rq_poll_total_jiffies = 0; + + rq_avg = nr_running() * 10; + + if (rq_info.rq_poll_total_jiffies) { + rq_avg = (rq_avg * jiffy_gap) + + (rq_info.rq_avg * + rq_info.rq_poll_total_jiffies); + do_div(rq_avg, + rq_info.rq_poll_total_jiffies + jiffy_gap); + } + + rq_info.rq_avg = rq_avg; + rq_info.rq_poll_total_jiffies += jiffy_gap; + rq_info.rq_poll_last_jiffy = jiffies; + + spin_unlock_irqrestore(&rq_lock, flags); + } +} + +static void wakeup_user(void) +{ + unsigned long jiffy_gap; + + jiffy_gap = jiffies - rq_info.def_timer_last_jiffy; + + if (jiffy_gap >= rq_info.def_timer_jiffies) { + rq_info.def_timer_last_jiffy = jiffies; + queue_work(rq_wq, &rq_info.def_timer_work); + } +} + /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. 
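update_rq_stats() above keeps a jiffy-weighted running average of nr_running: the old average is weighted by the jiffies it already covers, the fresh sample by the gap since the last poll. A self-contained sketch of the same arithmetic (function and variable names are illustrative):

#include <stdio.h>

/* Jiffy-weighted running average, mirroring update_rq_stats() above. */
static unsigned int rq_avg_update(unsigned int old_avg,
                                  unsigned long covered_jiffies,
                                  unsigned int sample,  /* nr_running * 10 */
                                  unsigned long gap_jiffies)
{
        if (!covered_jiffies)
                return sample;

        return (sample * gap_jiffies + old_avg * covered_jiffies) /
               (covered_jiffies + gap_jiffies);
}

int main(void)
{
        /* 3 runnable tasks for 10 jiffies, then 1 task for 5 jiffies:
         * (10*5 + 30*10) / 15 = 23, i.e. 2.3 tasks on average. */
        unsigned int avg = rq_avg_update(0, 0, 30, 10);

        avg = rq_avg_update(avg, 10, 10, 5);
        printf("rq_avg = %u (tenths of a task)\n", avg);
        return 0;
}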
@@ -1093,9 +1164,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * Do not call, when we are not in irq context and have * no valid regs pointer */ - if (regs) + if (regs) { tick_sched_handle(ts, regs); + if (rq_info.init == 1 && + tick_do_timer_cpu == smp_processor_id()) { + /* + * update run queue statistics + */ + update_rq_stats(); + + /* + * wakeup user if needed + */ + wakeup_user(); + } + } + /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; @@ -1208,3 +1293,8 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } + +ktime_t * get_next_event_cpu(unsigned int cpu) +{ + return &(per_cpu(tick_cpu_device, cpu).evtdev->next_event); +} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7d9067f1c056..790988e66289 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1671,9 +1671,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, { s64 interval = tk->cycle_interval; s64 xinterval = tk->xtime_interval; + u32 base = tk->tkr_mono.clock->mult; + u32 max = tk->tkr_mono.clock->maxadj; + u32 cur_adj = tk->tkr_mono.mult; s64 tick_error; bool negative; - u32 adj; + u32 adj_scale; /* Remove any current error adj from freq calculation */ if (tk->ntp_err_mult) @@ -1692,13 +1695,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, /* preserve the direction of correction */ negative = (tick_error < 0); - /* Sort out the magnitude of the correction */ + /* If any adjustment would pass the max, just return */ + if (negative && (cur_adj - 1) <= (base - max)) + return; + if (!negative && (cur_adj + 1) >= (base + max)) + return; + /* + * Sort out the magnitude of the correction, but + * avoid making so large a correction that we go + * over the max adjustment. 
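The timekeeping_freqadjust() change above still sizes the correction as a power of two, but stops doubling as soon as the next step would push the multiplier past clock->mult +/- maxadj. A standalone sketch of that clamping loop (every value below is invented for illustration):

#include <stdint.h>
#include <stdio.h>

/* Pick the largest power-of-two adjustment that keeps the multiplier
 * inside [base - max, base + max], following the loop shown above. */
static uint32_t pick_adj_scale(uint32_t base, uint32_t max, uint32_t cur,
                               int64_t tick_error, int64_t interval)
{
        int negative = tick_error < 0;
        uint32_t adj_scale = 0;

        if (negative)
                tick_error = -tick_error;

        while (tick_error > interval) {
                uint32_t adj = 1u << (adj_scale + 1);

                if (negative && cur - adj <= base - max)
                        break;
                if (!negative && cur + adj >= base + max)
                        break;

                adj_scale++;
                tick_error >>= 1;
        }
        return adj_scale;
}

int main(void)
{
        /* Huge error, but the clocksource only tolerates +/-16 around base,
         * so the scale is capped at 3 (an adjustment of 8) instead of ~20. */
        printf("adj_scale = %u\n",
               pick_adj_scale(1000, 16, 1000, (int64_t)1 << 20, 1));
        return 0;
}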
+ */ + adj_scale = 0; tick_error = abs(tick_error); - for (adj = 0; tick_error > interval; adj++) + while (tick_error > interval) { + u32 adj = 1 << (adj_scale + 1); + + /* Check if adjustment gets us within 1 unit from the max */ + if (negative && (cur_adj - adj) <= (base - max)) + break; + if (!negative && (cur_adj + adj) >= (base + max)) + break; + + adj_scale++; tick_error >>= 1; + } /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj); + timekeeping_apply_adjustment(tk, offset, negative, adj_scale); } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8e9e4b86f23e..67646a316436 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -95,12 +95,16 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; +static inline void __run_timers(struct tvec_base *base); static DEFINE_PER_CPU(struct tvec_base, tvec_bases); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; +struct tvec_base tvec_base_deferrable; +static atomic_t deferrable_pending; + void timers_update_migration(bool update_nohz) { bool on = sysctl_timer_migration && tick_nohz_active; @@ -136,18 +140,66 @@ int timer_migration_handler(struct ctl_table *table, int write, } static inline struct tvec_base *get_target_base(struct tvec_base *base, - int pinned) + int pinned, u32 timer_flags) { + if (!pinned && !(timer_flags & TIMER_PINNED_ON_CPU) && + (timer_flags & TIMER_DEFERRABLE)) + return &tvec_base_deferrable; if (pinned || !base->migration_enabled) return this_cpu_ptr(&tvec_bases); return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); } + +static inline void __run_deferrable_timers(void) +{ + if ((atomic_cmpxchg(&deferrable_pending, 1, 0) && + tick_do_timer_cpu == TICK_DO_TIMER_NONE) || + tick_do_timer_cpu == smp_processor_id()) { + if (time_after_eq(jiffies, + tvec_base_deferrable.timer_jiffies)) + __run_timers(&tvec_base_deferrable); + } +} + +static inline void init_timer_deferrable_global(void) +{ + tvec_base_deferrable.cpu = nr_cpu_ids; + spin_lock_init(&tvec_base_deferrable.lock); + tvec_base_deferrable.timer_jiffies = jiffies; + tvec_base_deferrable.next_timer = tvec_base_deferrable.timer_jiffies; +} + +static inline struct tvec_base *get_timer_base(u32 timer_flags) +{ + if (!(timer_flags & TIMER_PINNED_ON_CPU) && + timer_flags & TIMER_DEFERRABLE) + return &tvec_base_deferrable; + else + return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK); +} #else static inline struct tvec_base *get_target_base(struct tvec_base *base, - int pinned) + int pinned, u32 timer_flags) { return this_cpu_ptr(&tvec_bases); } + +static inline void __run_deferrable_timers(void) +{ +} + +static inline void init_timer_deferrable_global(void) +{ + /* + * initialize cpu unbound deferrable timer base only when CONFIG_SMP. + * UP kernel handles the timers with cpu 0 timer base. 
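The timer hunks above give every timer one of two homes: a timer that is deferrable and not pinned to a CPU is parked on the shared tvec_base_deferrable, everything else stays on a per-CPU base. A compact sketch of that routing decision (flag values are illustrative, not the kernel's):

#include <stdio.h>

#define TIMER_DEFERRABLE        0x1u
#define TIMER_PINNED_ON_CPU     0x2u

enum timer_home { HOME_PER_CPU, HOME_GLOBAL_DEFERRABLE };

/* Mirrors the tests in get_target_base()/get_timer_base() above. */
static enum timer_home pick_base(unsigned int flags)
{
        if (!(flags & TIMER_PINNED_ON_CPU) && (flags & TIMER_DEFERRABLE))
                return HOME_GLOBAL_DEFERRABLE;
        return HOME_PER_CPU;
}

int main(void)
{
        printf("deferrable, unpinned -> %d\n", pick_base(TIMER_DEFERRABLE));
        printf("deferrable, pinned   -> %d\n",
               pick_base(TIMER_DEFERRABLE | TIMER_PINNED_ON_CPU));
        printf("ordinary             -> %d\n", pick_base(0));
        return 0;
}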
+ */ +} + +static inline struct tvec_base *get_timer_base(u32 timer_flags) +{ + return per_cpu_ptr(&tvec_bases, timer_flags & TIMER_CPUMASK); +} #endif static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -449,38 +501,6 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) } } -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - void *site; - - /* - * start_site can be concurrently reset by - * timer_stats_timer_clear_start_info() - */ - site = READ_ONCE(timer->start_site); - if (likely(!site)) - return; - - timer_stats_update_stats(timer, timer->start_pid, site, - timer->function, timer->start_comm, - timer->flags); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static struct debug_obj_descr timer_debug_descr; @@ -683,11 +703,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); timer->slack = -1; -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif lockdep_init_map(&timer->lockdep_map, name, key, 0); } @@ -776,7 +791,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + base = get_timer_base(tf); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -794,7 +809,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned long flags; int ret = 0; - timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); @@ -805,7 +819,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned); + new_base = get_target_base(base, pinned, timer->flags); if (base != new_base) { /* @@ -827,6 +841,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } } + if (pinned == TIMER_PINNED) + timer->flags |= TIMER_PINNED_ON_CPU; + else + timer->flags &= ~TIMER_PINNED_ON_CPU; timer->expires = expires; internal_add_timer(base, timer); @@ -989,7 +1007,6 @@ void add_timer_on(struct timer_list *timer, int cpu) struct tvec_base *base; unsigned long flags; - timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); /* @@ -1008,6 +1025,7 @@ void add_timer_on(struct timer_list *timer, int cpu) (timer->flags & ~TIMER_BASEMASK) | cpu); } + timer->flags |= TIMER_PINNED_ON_CPU; debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1033,7 +1051,6 @@ int del_timer(struct timer_list *timer) debug_assert_init(timer); - timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); @@ -1061,10 +1078,9 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) { - timer_stats_timer_clear_start_info(timer); + if (base->running_timer != timer) ret = detach_if_pending(timer, base, 
true); - } + spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1248,8 +1264,6 @@ static inline void __run_timers(struct tvec_base *base) data = timer->data; irqsafe = timer->flags & TIMER_IRQSAFE; - timer_stats_account_timer(timer); - base->running_timer = timer; detach_expired_timer(timer, base); @@ -1377,6 +1391,30 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } +#ifdef CONFIG_SMP +/* + * check_pending_deferrable_timers - Check for unbound deferrable timer expiry. + * @cpu - Current CPU + * + * The function checks whether any global deferrable pending timers + * are exipired or not. This function does not check cpu bounded + * diferrable pending timers expiry. + * + * The function returns true when a cpu unbounded deferrable timer is expired. + */ +bool check_pending_deferrable_timers(int cpu) +{ + if (cpu == tick_do_timer_cpu || + tick_do_timer_cpu == TICK_DO_TIMER_NONE) { + if (time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies) + && !atomic_cmpxchg(&deferrable_pending, 0, 1)) { + return true; + } + } + return false; +} +#endif + /** * get_next_timer_interrupt - return the time (clock mono) of the next timer * @basej: base time jiffies @@ -1441,6 +1479,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = this_cpu_ptr(&tvec_bases); + __run_deferrable_timers(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1483,11 +1523,12 @@ static void process_timeout(unsigned long __data) * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process())". * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. @@ -1496,7 +1537,9 @@ static void process_timeout(unsigned long __data) * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * - * In all cases the return value is guaranteed to be non-negative. + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. 
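check_pending_deferrable_timers() and __run_deferrable_timers() above use atomic_cmpxchg() so that exactly one CPU claims the shared deferrable base per expiry and exactly one consumes that claim. A runnable userspace sketch of the same hand-off with C11 atomics (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int deferrable_pending;   /* 0 = idle, 1 = claimed */

/* Succeeds for exactly one caller per 0 -> 1 transition, the way
 * check_pending_deferrable_timers() wins the right to raise the softirq. */
static bool try_claim(void)
{
        int expected = 0;

        return atomic_compare_exchange_strong(&deferrable_pending,
                                              &expected, 1);
}

/* The 1 -> 0 side, as in __run_deferrable_timers(). */
static bool consume(void)
{
        int expected = 1;

        return atomic_compare_exchange_strong(&deferrable_pending,
                                              &expected, 0);
}

int main(void)
{
        printf("first claim:  %d\n", try_claim());      /* 1 */
        printf("second claim: %d\n", try_claim());      /* 0, already claimed */
        printf("consume:      %d\n", consume());        /* 1 */
        return 0;
}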
*/ signed long __sched schedule_timeout(signed long timeout) { @@ -1574,56 +1617,82 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +#if defined(CONFIG_HOTPLUG_CPU) +static void migrate_timer_list(struct tvec_base *new_base, + struct hlist_head *head, bool remove_pinned) { struct timer_list *timer; int cpu = new_base->cpu; + struct hlist_node *n; + int is_pinned; + + hlist_for_each_entry_safe(timer, n, head, entry) { + is_pinned = timer->flags & TIMER_PINNED_ON_CPU; + if (!remove_pinned && is_pinned) + continue; - while (!hlist_empty(head)) { - timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ - detach_timer(timer, false); + detach_if_pending(timer, get_timer_base(timer->flags), false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } -static void migrate_timers(int cpu) +static void __migrate_timers(int cpu, bool remove_pinned) { struct tvec_base *old_base; struct tvec_base *new_base; + unsigned long flags; int i; - BUG_ON(cpu_online(cpu)); old_base = per_cpu_ptr(&tvec_bases, cpu); new_base = get_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); + spin_lock_irqsave(&new_base->lock, flags); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - BUG_ON(old_base->running_timer); + /* + * If we're in the hotplug path, kill the system if there's a running + * timer. It's ok to have a running timer in the isolation case - the + * currently running or just expired timers are off of the timer wheel + * and so everything else can be migrated off. 
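migrate_timer_list() above walks the dead (or isolated) CPU's lists and moves everything except, in the isolation case, timers flagged TIMER_PINNED_ON_CPU. A toy, self-contained version of that filtered move (struct fake_timer is a stand-in, not a kernel type):

#include <stdbool.h>
#include <stdio.h>

struct fake_timer {
        const char *name;
        bool pinned;
        struct fake_timer *next;
};

/* Move every entry from *from to *to, but leave pinned entries behind
 * when remove_pinned is false - the policy applied during CPU isolation. */
static void migrate_list(struct fake_timer **from, struct fake_timer **to,
                         bool remove_pinned)
{
        struct fake_timer **link = from;

        while (*link) {
                struct fake_timer *t = *link;

                if (!remove_pinned && t->pinned) {
                        link = &t->next;        /* stays on the old base */
                        continue;
                }
                *link = t->next;                /* unlink ...            */
                t->next = *to;                  /* ... push on new base  */
                *to = t;
        }
}

int main(void)
{
        struct fake_timer c = { "pinned-watchdog", true,  NULL };
        struct fake_timer b = { "deferrable",      false, &c };
        struct fake_timer a = { "ordinary",        false, &b };
        struct fake_timer *old_cpu = &a, *new_cpu = NULL;

        migrate_list(&old_cpu, &new_cpu, false);        /* isolation case */

        for (struct fake_timer *t = old_cpu; t; t = t->next)
                printf("stayed:   %s\n", t->name);
        for (struct fake_timer *t = new_cpu; t; t = t->next)
                printf("migrated: %s\n", t->name);
        return 0;
}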
+ */ + if (!cpu_online(cpu)) + BUG_ON(old_base->running_timer); for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(new_base, old_base->tv1.vec + i, + remove_pinned); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(new_base, old_base->tv2.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv3.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv4.vec + i, + remove_pinned); + migrate_timer_list(new_base, old_base->tv5.vec + i, + remove_pinned); } - old_base->active_timers = 0; - old_base->all_timers = 0; - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); put_cpu_ptr(&tvec_bases); } +/* Migrate timers from 'cpu' to this_cpu */ +static void migrate_timers(int cpu) +{ + BUG_ON(cpu_online(cpu)); + __migrate_timers(cpu, true); +} + +void timer_quiesce_cpu(void *cpup) +{ + __migrate_timers(*(int *)cpup, false); +} + static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1664,12 +1733,13 @@ static void __init init_timer_cpus(void) for_each_possible_cpu(cpu) init_timer_cpu(cpu); + + init_timer_deferrable_global(); } void __init init_timers(void) { init_timer_cpus(); - init_timer_stats(); timer_register_cpu_notifier(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } @@ -1703,16 +1773,6 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static void __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - u64 delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (u64)(max - min) * NSEC_PER_USEC; - schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - /** * usleep_range - Drop in replacement for udelay where wakeup is flexible * @min: Minimum time in usecs to sleep @@ -1720,7 +1780,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) */ void __sched usleep_range(unsigned long min, unsigned long max) { - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } } EXPORT_SYMBOL(usleep_range); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b7c5d230b4b2..7b67a9fb3788 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -63,21 +63,11 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { -#ifdef CONFIG_TIMER_STATS - char tmp[TASK_COMM_LEN + 1]; -#endif SEQ_printf(m, " #%d: ", idx); print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02x", timer->state); -#ifdef CONFIG_TIMER_STATS - SEQ_printf(m, ", "); - print_name_offset(m, timer->start_site); - memcpy(tmp, timer->start_comm, TASK_COMM_LEN); - tmp[TASK_COMM_LEN] = 0; - SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); -#endif SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long 
long)ktime_to_ns(hrtimer_get_softexpires(timer)), diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c deleted file mode 100644 index 7e4d715f9c22..000000000000 --- a/kernel/time/timer_stats.c +++ /dev/null @@ -1,425 +0,0 @@ -/* - * kernel/time/timer_stats.c - * - * Collect timer usage statistics. - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * timer_stats is based on timer_top, a similar functionality which was part of - * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the - * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based - * on dynamic allocation of the statistics entries and linear search based - * lookup combined with a global lock, rather than the static array, hash - * and per-CPU locking which is used by timer_stats. It was written for the - * pre hrtimer kernel code and therefore did not take hrtimers into account. - * Nevertheless it provided the base for the timer_stats implementation and - * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks - * for this effort. - * - * timer_top.c is - * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus - * Written by Daniel Petrini <d.pensator@gmail.com> - * timer_top.c was released under the GNU General Public License version 2 - * - * We export the addresses and counting of timer functions being called, - * the pid and cmdline from the owner process if applicable. - * - * Start/stop data collection: - * # echo [1|0] >/proc/timer_stats - * - * Display the information collected so far: - * # cat /proc/timer_stats - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/proc_fs.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/sched.h> -#include <linux/seq_file.h> -#include <linux/kallsyms.h> - -#include <asm/uaccess.h> - -/* - * This is our basic unit of interest: a timer expiry event identified - * by the timer, its start/expire functions and the PID of the task that - * started the timer. We count the number of times an event happens: - */ -struct entry { - /* - * Hash list: - */ - struct entry *next; - - /* - * Hash keys: - */ - void *timer; - void *start_func; - void *expire_func; - pid_t pid; - - /* - * Number of timeout events: - */ - unsigned long count; - u32 flags; - - /* - * We save the command-line string to preserve - * this information past task exit: - */ - char comm[TASK_COMM_LEN + 1]; - -} ____cacheline_aligned_in_smp; - -/* - * Spinlock protecting the tables - not taken during lookup: - */ -static DEFINE_RAW_SPINLOCK(table_lock); - -/* - * Per-CPU lookup locks for fast hash lookup: - */ -static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); - -/* - * Mutex to serialize state changes with show-stats activities: - */ -static DEFINE_MUTEX(show_mutex); - -/* - * Collection status, active/inactive: - */ -int __read_mostly timer_stats_active; - -/* - * Beginning/end timestamps of measurement: - */ -static ktime_t time_start, time_stop; - -/* - * tstat entry structs only get allocated while collection is - * active and never freed during that time - this simplifies - * things quite a bit. - * - * They get freed when a new collection period is started. 
- */ -#define MAX_ENTRIES_BITS 10 -#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) - -static unsigned long nr_entries; -static struct entry entries[MAX_ENTRIES]; - -static atomic_t overflow_count; - -/* - * The entries are in a hash-table, for fast lookup: - */ -#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) -#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) -#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) - -#define __tstat_hashfn(entry) \ - (((unsigned long)(entry)->timer ^ \ - (unsigned long)(entry)->start_func ^ \ - (unsigned long)(entry)->expire_func ^ \ - (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) - -#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) - -static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; - -static void reset_entries(void) -{ - nr_entries = 0; - memset(entries, 0, sizeof(entries)); - memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); - atomic_set(&overflow_count, 0); -} - -static struct entry *alloc_entry(void) -{ - if (nr_entries >= MAX_ENTRIES) - return NULL; - - return entries + nr_entries++; -} - -static int match_entries(struct entry *entry1, struct entry *entry2) -{ - return entry1->timer == entry2->timer && - entry1->start_func == entry2->start_func && - entry1->expire_func == entry2->expire_func && - entry1->pid == entry2->pid; -} - -/* - * Look up whether an entry matching this item is present - * in the hash already. Must be called with irqs off and the - * lookup lock held: - */ -static struct entry *tstat_lookup(struct entry *entry, char *comm) -{ - struct entry **head, *curr, *prev; - - head = tstat_hashentry(entry); - curr = *head; - - /* - * The fastpath is when the entry is already hashed, - * we do this with the lookup lock held, but with the - * table lock not held: - */ - while (curr) { - if (match_entries(curr, entry)) - return curr; - - curr = curr->next; - } - /* - * Slowpath: allocate, set up and link a new hash entry: - */ - prev = NULL; - curr = *head; - - raw_spin_lock(&table_lock); - /* - * Make sure we have not raced with another CPU: - */ - while (curr) { - if (match_entries(curr, entry)) - goto out_unlock; - - prev = curr; - curr = curr->next; - } - - curr = alloc_entry(); - if (curr) { - *curr = *entry; - curr->count = 0; - curr->next = NULL; - memcpy(curr->comm, comm, TASK_COMM_LEN); - - smp_mb(); /* Ensure that curr is initialized before insert */ - - if (prev) - prev->next = curr; - else - *head = curr; - } - out_unlock: - raw_spin_unlock(&table_lock); - - return curr; -} - -/** - * timer_stats_update_stats - Update the statistics for a timer. - * @timer: pointer to either a timer_list or a hrtimer - * @pid: the pid of the task which set up the timer - * @startf: pointer to the function which did the timer setup - * @timerf: pointer to the timer callback function of the timer - * @comm: name of the process which set up the timer - * @tflags: The flags field of the timer - * - * When the timer is already registered, then the event counter is - * incremented. Otherwise the timer is registered in a free slot. 
- */ -void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, u32 tflags) -{ - /* - * It doesn't matter which lock we take: - */ - raw_spinlock_t *lock; - struct entry *entry, input; - unsigned long flags; - - if (likely(!timer_stats_active)) - return; - - lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); - - input.timer = timer; - input.start_func = startf; - input.expire_func = timerf; - input.pid = pid; - input.flags = tflags; - - raw_spin_lock_irqsave(lock, flags); - if (!timer_stats_active) - goto out_unlock; - - entry = tstat_lookup(&input, comm); - if (likely(entry)) - entry->count++; - else - atomic_inc(&overflow_count); - - out_unlock: - raw_spin_unlock_irqrestore(lock, flags); -} - -static void print_name_offset(struct seq_file *m, unsigned long addr) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name(addr, symname) < 0) - seq_printf(m, "<%p>", (void *)addr); - else - seq_printf(m, "%s", symname); -} - -static int tstats_show(struct seq_file *m, void *v) -{ - struct timespec period; - struct entry *entry; - unsigned long ms; - long events = 0; - ktime_t time; - int i; - - mutex_lock(&show_mutex); - /* - * If still active then calculate up to now: - */ - if (timer_stats_active) - time_stop = ktime_get(); - - time = ktime_sub(time_stop, time_start); - - period = ktime_to_timespec(time); - ms = period.tv_nsec / 1000000; - - seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); - if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); - seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); - - for (i = 0; i < nr_entries; i++) { - entry = entries + i; - if (entry->flags & TIMER_DEFERRABLE) { - seq_printf(m, "%4luD, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } else { - seq_printf(m, " %4lu, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } - - print_name_offset(m, (unsigned long)entry->start_func); - seq_puts(m, " ("); - print_name_offset(m, (unsigned long)entry->expire_func); - seq_puts(m, ")\n"); - - events += entry->count; - } - - ms += period.tv_sec * 1000; - if (!ms) - ms = 1; - - if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", - events, events * 1000 / ms, - (events * 1000000 / ms) % 1000); - else - seq_printf(m, "%ld total events\n", events); - - mutex_unlock(&show_mutex); - - return 0; -} - -/* - * After a state change, make sure all concurrent lookup/update - * activities have stopped: - */ -static void sync_access(void) -{ - unsigned long flags; - int cpu; - - for_each_online_cpu(cpu) { - raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); - - raw_spin_lock_irqsave(lock, flags); - /* nothing */ - raw_spin_unlock_irqrestore(lock, flags); - } -} - -static ssize_t tstats_write(struct file *file, const char __user *buf, - size_t count, loff_t *offs) -{ - char ctl[2]; - - if (count != 2 || *offs) - return -EINVAL; - - if (copy_from_user(ctl, buf, count)) - return -EFAULT; - - mutex_lock(&show_mutex); - switch (ctl[0]) { - case '0': - if (timer_stats_active) { - timer_stats_active = 0; - time_stop = ktime_get(); - sync_access(); - } - break; - case '1': - if (!timer_stats_active) { - reset_entries(); - time_start = ktime_get(); - smp_mb(); - timer_stats_active = 1; - } - break; - default: - count = -EINVAL; - } - mutex_unlock(&show_mutex); - - return count; -} - -static int tstats_open(struct inode *inode, struct file 
*filp) -{ - return single_open(filp, tstats_show, NULL); -} - -static const struct file_operations tstats_fops = { - .open = tstats_open, - .read = seq_read, - .write = tstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -void __init init_timer_stats(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); -} - -static int __init init_tstats_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("timer_stats", 0600, NULL, &tstats_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_tstats_procfs); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 006eefb6ede0..d8b00f94c63d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -89,6 +89,31 @@ config RING_BUFFER_ALLOW_SWAP Allow the use of ring_buffer_swap_cpu. Adds a very slight overhead to tracing when enabled. +config IPC_LOGGING + bool "Debug Logging for IPC Drivers" + select GENERIC_TRACER + help + This option allows the debug logging for IPC Drivers. + + If in doubt, say no. + +config QCOM_RTB + bool "Register tracing" + help + Add support for logging different events to a small uncached + region. This is designed to aid in debugging reset cases where the + caches may not be flushed before the target resets. + +config QCOM_RTB_SEPARATE_CPUS + bool "Separate entries for each cpu" + depends on QCOM_RTB + depends on SMP + help + Under some circumstances, it may be beneficial to give dedicated space + for each cpu to log accesses. Selecting this option will log each cpu + separately. This will guarantee that the last acesses for each cpu + will be logged but there will be fewer entries per cpu + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -111,6 +136,20 @@ config GENERIC_TRACER bool select TRACING +if TRACING + +config DISABLE_TRACE_PRINTK + bool "Force disable trace_printk() usage" + default y + help + When trace_printk() is used in any of the kernel source, it enables + debugging functions which are not desired for production kernel. + Enabling this option will replace trace_printk() with pr_debug(). + + If in doubt, say Y. + +endif + # # Minimum requirements an architecture has to meet for us to # be able to offer generic tracing facilities: @@ -499,6 +538,19 @@ config FUNCTION_PROFILER If in doubt, say N. +config CPU_FREQ_SWITCH_PROFILER + bool "CPU frequency switch time profiler" + select GENERIC_TRACER + help + This option enables the CPU frequency switch profiler. A file is + created in debugfs called "cpu_freq_switch_profile_enabled", which + defaults to zero. When a 1 is echoed into this file, profiling begins. + When a zero is echoed, profiling stops. A "cpu_freq_switch" file is + also created in the trace_stats directory; this file shows the + switches that have occurred and duration statistics. + + If in doubt, say N. 
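The DISABLE_TRACE_PRINTK help text above says the option replaces trace_printk() with pr_debug(), but the Kconfig fragment does not show how. One plausible way to implement that redirection is a preprocessor override in a shared header; this is a hypothetical sketch, not code from this patch:

  /* Hypothetical illustration only: route trace_printk() to pr_debug()
   * when CONFIG_DISABLE_TRACE_PRINTK is set. The real hook point in the
   * kernel headers may differ. */
  #ifdef CONFIG_DISABLE_TRACE_PRINTK
  #undef trace_printk
  #define trace_printk(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
  #endif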
+ config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 4b35fb97ae44..a0177ae43058 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_CPU_FREQ_SWITCH_PROFILER) += trace_cpu_freq_switch.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o @@ -70,7 +71,12 @@ endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o +obj-$(CONFIG_QCOM_RTB) += msm_rtb.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_IPC_LOGGING) += ipc_logging.o +ifdef CONFIG_DEBUG_FS +obj-$(CONFIG_IPC_LOGGING) += ipc_logging_debug.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c142e100840e..baa82e3ab2c0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -203,9 +203,9 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) + int rw, u32 what, int error, int pdu_len, + void *pdu_data, struct task_struct *tsk) { - struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct ring_buffer *buffer = NULL; struct blk_io_trace *t; @@ -783,6 +783,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, unsigned int nr_bytes, u32 what) { struct blk_trace *bt; + struct task_struct *tsk = current; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); @@ -791,14 +792,28 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, return; } + /* + * Use the bio context for all events except ISSUE and + * COMPLETE events. + * + * Not all the pages in the bio are dirtied by the same task but + * most likely it will be, since the sectors accessed on the device + * must be adjacent. + */ + if (!((what == BLK_TA_ISSUE) || (what == BLK_TA_COMPLETE)) && + bio_has_data(rq->bio) && rq->bio->bi_io_vec && + rq->bio->bi_io_vec->bv_page && + rq->bio->bi_io_vec->bv_page->tsk_dirty) + tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty; + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); + what, rq->errors, rq->cmd_len, rq->cmd, tsk); } else { what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, - rq->cmd_flags, what, rq->errors, 0, NULL); + rq->cmd_flags, what, rq->errors, 0, NULL, tsk); } rcu_read_unlock(); } @@ -851,6 +866,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { struct blk_trace *bt; + struct task_struct *tsk = current; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); @@ -859,8 +875,17 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; } + /* + * Not all the pages in the bio are dirtied by the same task but + * most likely it will be, since the sectors accessed on the device + * must be adjacent. 
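The blktrace hunks that follow repeat the same test at every call site: attribute the event to the task recorded in the first bio_vec page's tsk_dirty field when it is set, otherwise fall back to current. Note that page->tsk_dirty is an out-of-tree field added elsewhere in this patch series, not a mainline struct page member. A hedged sketch of how the repeated test could be factored into a helper (not part of this diff):

  /* Hypothetical consolidation of the repeated tsk_dirty test; assumes
   * the out-of-tree page->tsk_dirty field added elsewhere in the series. */
  static struct task_struct *blk_trace_bio_task(struct bio *bio)
  {
          if (bio && bio_has_data(bio) && bio->bi_io_vec &&
              bio->bi_io_vec->bv_page && bio->bi_io_vec->bv_page->tsk_dirty)
                  return bio->bi_io_vec->bv_page->tsk_dirty;
          return current;
  }

  /* e.g. in blk_add_trace_bio(): tsk = blk_trace_bio_task(bio); */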
+ */ + if (bio_has_data(bio) && bio->bi_io_vec && bio->bi_io_vec->bv_page && + bio->bi_io_vec->bv_page->tsk_dirty) + tsk = bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, what, error, 0, NULL); + bio->bi_rw, what, error, 0, NULL, tsk); rcu_read_unlock(); } @@ -911,7 +936,8 @@ static void blk_add_trace_getrq(void *ignore, rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, + NULL, current); rcu_read_unlock(); } } @@ -930,7 +956,7 @@ static void blk_add_trace_sleeprq(void *ignore, bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, current); rcu_read_unlock(); } } @@ -942,7 +968,8 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, + current); rcu_read_unlock(); } @@ -962,7 +989,8 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, + current); } rcu_read_unlock(); } @@ -972,15 +1000,21 @@ static void blk_add_trace_split(void *ignore, unsigned int pdu) { struct blk_trace *bt; + struct task_struct *tsk = current; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); + if (bio_has_data(bio) && bio->bi_io_vec && + bio->bi_io_vec->bv_page && + bio->bi_io_vec->bv_page->tsk_dirty) + tsk = bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - bio->bi_error, sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu, tsk); } rcu_read_unlock(); } @@ -1004,6 +1038,7 @@ static void blk_add_trace_bio_remap(void *ignore, { struct blk_trace *bt; struct blk_io_trace_remap r; + struct task_struct *tsk = current; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); @@ -1016,9 +1051,14 @@ static void blk_add_trace_bio_remap(void *ignore, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); + if (bio_has_data(bio) && bio->bi_io_vec && + bio->bi_io_vec->bv_page && + bio->bi_io_vec->bv_page->tsk_dirty) + tsk = bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_REMAP, bio->bi_error, - sizeof(r), &r); + sizeof(r), &r, tsk); rcu_read_unlock(); } @@ -1042,6 +1082,7 @@ static void blk_add_trace_rq_remap(void *ignore, { struct blk_trace *bt; struct blk_io_trace_remap r; + struct task_struct *tsk = current; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); @@ -1054,9 +1095,14 @@ static void blk_add_trace_rq_remap(void *ignore, r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); r.sector_from = cpu_to_be64(from); + if (bio_has_data(rq->bio) && rq->bio->bi_io_vec && + rq->bio->bi_io_vec->bv_page && + rq->bio->bi_io_vec->bv_page->tsk_dirty) + tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty; + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, - sizeof(r), &r); + sizeof(r), &r, tsk); rcu_read_unlock(); } @@ -1076,6 +1122,7 @@ void blk_add_driver_data(struct request_queue *q, void *data, size_t len) { struct blk_trace *bt; + struct 
task_struct *tsk = current; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); @@ -1084,12 +1131,17 @@ void blk_add_driver_data(struct request_queue *q, return; } + if (bio_has_data(rq->bio) && rq->bio->bi_io_vec && + rq->bio->bi_io_vec->bv_page && + rq->bio->bi_io_vec->bv_page->tsk_dirty) + tsk = rq->bio->bi_io_vec->bv_page->tsk_dirty; + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, rq->errors, len, data, tsk); else __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, rq->errors, len, data, tsk); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); diff --git a/kernel/trace/ipc_logging.c b/kernel/trace/ipc_logging.c new file mode 100644 index 000000000000..ed29c38cd7fb --- /dev/null +++ b/kernel/trace/ipc_logging.c @@ -0,0 +1,876 @@ +/* Copyright (c) 2012-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <asm/arch_timer.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/idr.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/delay.h> +#include <linux/completion.h> +#include <linux/ipc_logging.h> + +#include "ipc_logging_private.h" + +#define LOG_PAGE_DATA_SIZE sizeof(((struct ipc_log_page *)0)->data) +#define LOG_PAGE_FLAG (1 << 31) + +static LIST_HEAD(ipc_log_context_list); +static DEFINE_RWLOCK(context_list_lock_lha1); +static void *get_deserialization_func(struct ipc_log_context *ilctxt, + int type); + +static struct ipc_log_page *get_first_page(struct ipc_log_context *ilctxt) +{ + struct ipc_log_page_header *p_pghdr; + struct ipc_log_page *pg = NULL; + + if (!ilctxt) + return NULL; + p_pghdr = list_first_entry(&ilctxt->page_list, + struct ipc_log_page_header, list); + pg = container_of(p_pghdr, struct ipc_log_page, hdr); + return pg; +} + +/** + * is_nd_read_empty - Returns true if no data is available to read in log + * + * @ilctxt: logging context + * @returns: > 1 if context is empty; 0 if not empty; <0 for failure + * + * This is for the debugfs read pointer which allows for a non-destructive read. + * There may still be data in the log, but it may have already been read. + */ +static int is_nd_read_empty(struct ipc_log_context *ilctxt) +{ + if (!ilctxt) + return -EINVAL; + + return ((ilctxt->nd_read_page == ilctxt->write_page) && + (ilctxt->nd_read_page->hdr.nd_read_offset == + ilctxt->write_page->hdr.write_offset)); +} + +/** + * is_read_empty - Returns true if no data is available in log + * + * @ilctxt: logging context + * @returns: > 1 if context is empty; 0 if not empty; <0 for failure + * + * This is for the actual log contents. If it is empty, then there + * is no data at all in the log. 
+ */ +static int is_read_empty(struct ipc_log_context *ilctxt) +{ + if (!ilctxt) + return -EINVAL; + + return ((ilctxt->read_page == ilctxt->write_page) && + (ilctxt->read_page->hdr.read_offset == + ilctxt->write_page->hdr.write_offset)); +} + +/** + * is_nd_read_equal_read - Return true if the non-destructive read is equal to + * the destructive read + * + * @ilctxt: logging context + * @returns: true if nd read is equal to read; false otherwise + */ +static bool is_nd_read_equal_read(struct ipc_log_context *ilctxt) +{ + uint16_t read_offset; + uint16_t nd_read_offset; + + if (ilctxt->nd_read_page == ilctxt->read_page) { + read_offset = ilctxt->read_page->hdr.read_offset; + nd_read_offset = ilctxt->nd_read_page->hdr.nd_read_offset; + + if (read_offset == nd_read_offset) + return true; + } + + return false; +} + + +static struct ipc_log_page *get_next_page(struct ipc_log_context *ilctxt, + struct ipc_log_page *cur_pg) +{ + struct ipc_log_page_header *p_pghdr; + struct ipc_log_page *pg = NULL; + + if (!ilctxt || !cur_pg) + return NULL; + + if (ilctxt->last_page == cur_pg) + return ilctxt->first_page; + + p_pghdr = list_first_entry(&cur_pg->hdr.list, + struct ipc_log_page_header, list); + pg = container_of(p_pghdr, struct ipc_log_page, hdr); + + return pg; +} + +/** + * ipc_log_read - do non-destructive read of the log + * + * @ilctxt: Logging context + * @data: Data pointer to receive the data + * @data_size: Number of bytes to read (must be <= bytes available in log) + * + * This read will update a runtime read pointer, but will not affect the actual + * contents of the log which allows for reading the logs continuously while + * debugging and if the system crashes, then the full logs can still be + * extracted. + */ +static void ipc_log_read(struct ipc_log_context *ilctxt, + void *data, int data_size) +{ + int bytes_to_read; + + bytes_to_read = MIN(LOG_PAGE_DATA_SIZE + - ilctxt->nd_read_page->hdr.nd_read_offset, + data_size); + + memcpy(data, (ilctxt->nd_read_page->data + + ilctxt->nd_read_page->hdr.nd_read_offset), bytes_to_read); + + if (bytes_to_read != data_size) { + /* not enough space, wrap read to next page */ + ilctxt->nd_read_page->hdr.nd_read_offset = 0; + ilctxt->nd_read_page = get_next_page(ilctxt, + ilctxt->nd_read_page); + BUG_ON(ilctxt->nd_read_page == NULL); + + memcpy((data + bytes_to_read), + (ilctxt->nd_read_page->data + + ilctxt->nd_read_page->hdr.nd_read_offset), + (data_size - bytes_to_read)); + bytes_to_read = (data_size - bytes_to_read); + } + ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read; +} + +/** + * ipc_log_drop - do destructive read of the log + * + * @ilctxt: Logging context + * @data: Data pointer to receive the data (or NULL) + * @data_size: Number of bytes to read (must be <= bytes available in log) + */ +static void ipc_log_drop(struct ipc_log_context *ilctxt, void *data, + int data_size) +{ + int bytes_to_read; + bool push_nd_read; + + bytes_to_read = MIN(LOG_PAGE_DATA_SIZE + - ilctxt->read_page->hdr.read_offset, + data_size); + if (data) + memcpy(data, (ilctxt->read_page->data + + ilctxt->read_page->hdr.read_offset), bytes_to_read); + + if (bytes_to_read != data_size) { + /* not enough space, wrap read to next page */ + push_nd_read = is_nd_read_equal_read(ilctxt); + + ilctxt->read_page->hdr.read_offset = 0; + if (push_nd_read) { + ilctxt->read_page->hdr.nd_read_offset = 0; + ilctxt->read_page = get_next_page(ilctxt, + ilctxt->read_page); + BUG_ON(ilctxt->read_page == NULL); + ilctxt->nd_read_page = ilctxt->read_page; + } else { + 
ilctxt->read_page = get_next_page(ilctxt, + ilctxt->read_page); + BUG_ON(ilctxt->read_page == NULL); + } + + if (data) + memcpy((data + bytes_to_read), + (ilctxt->read_page->data + + ilctxt->read_page->hdr.read_offset), + (data_size - bytes_to_read)); + + bytes_to_read = (data_size - bytes_to_read); + } + + /* update non-destructive read pointer if necessary */ + push_nd_read = is_nd_read_equal_read(ilctxt); + ilctxt->read_page->hdr.read_offset += bytes_to_read; + ilctxt->write_avail += data_size; + + if (push_nd_read) + ilctxt->nd_read_page->hdr.nd_read_offset += bytes_to_read; +} + +/** + * msg_read - Reads a message. + * + * If a message is read successfully, then the message context + * will be set to: + * .hdr message header .size and .type values + * .offset beginning of message data + * + * @ilctxt Logging context + * @ectxt Message context + * + * @returns 0 - no message available; >0 message size; <0 error + */ +static int msg_read(struct ipc_log_context *ilctxt, + struct encode_context *ectxt) +{ + struct tsv_header hdr; + + if (!ectxt) + return -EINVAL; + + if (is_nd_read_empty(ilctxt)) + return 0; + + ipc_log_read(ilctxt, &hdr, sizeof(hdr)); + ectxt->hdr.type = hdr.type; + ectxt->hdr.size = hdr.size; + ectxt->offset = sizeof(hdr); + ipc_log_read(ilctxt, (ectxt->buff + ectxt->offset), + (int)hdr.size); + + return sizeof(hdr) + (int)hdr.size; +} + +/** + * msg_drop - Drops a message. + * + * @ilctxt Logging context + */ +static void msg_drop(struct ipc_log_context *ilctxt) +{ + struct tsv_header hdr; + + if (!is_read_empty(ilctxt)) { + ipc_log_drop(ilctxt, &hdr, sizeof(hdr)); + ipc_log_drop(ilctxt, NULL, (int)hdr.size); + } +} + +/* + * Commits messages to the FIFO. If the FIFO is full, then enough + * messages are dropped to create space for the new message. + */ +void ipc_log_write(void *ctxt, struct encode_context *ectxt) +{ + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + int bytes_to_write; + unsigned long flags; + + if (!ilctxt || !ectxt) { + pr_err("%s: Invalid ipc_log or encode context\n", __func__); + return; + } + + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + while (ilctxt->write_avail <= ectxt->offset) + msg_drop(ilctxt); + + bytes_to_write = MIN(LOG_PAGE_DATA_SIZE + - ilctxt->write_page->hdr.write_offset, + ectxt->offset); + memcpy((ilctxt->write_page->data + + ilctxt->write_page->hdr.write_offset), + ectxt->buff, bytes_to_write); + + if (bytes_to_write != ectxt->offset) { + uint64_t t_now = sched_clock(); + + ilctxt->write_page->hdr.write_offset += bytes_to_write; + ilctxt->write_page->hdr.end_time = t_now; + + ilctxt->write_page = get_next_page(ilctxt, ilctxt->write_page); + BUG_ON(ilctxt->write_page == NULL); + ilctxt->write_page->hdr.write_offset = 0; + ilctxt->write_page->hdr.start_time = t_now; + memcpy((ilctxt->write_page->data + + ilctxt->write_page->hdr.write_offset), + (ectxt->buff + bytes_to_write), + (ectxt->offset - bytes_to_write)); + bytes_to_write = (ectxt->offset - bytes_to_write); + } + ilctxt->write_page->hdr.write_offset += bytes_to_write; + ilctxt->write_avail -= ectxt->offset; + complete(&ilctxt->read_avail); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); +} +EXPORT_SYMBOL(ipc_log_write); + +/* + * Starts a new message after which you can add serialized data and + * then complete the message by calling msg_encode_end(). 
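To make the encode flow concrete: a message is opened with msg_encode_start(), fields are appended with the tsv_*_write() helpers, the header size is finalized with msg_encode_end(), and the buffer is committed with ipc_log_write(). A hedged sketch that mirrors what ipc_log_string() does further down in this file (the text must fit within MAX_MSG_SIZE):

  /* Sketch of the encode/commit sequence; mirrors ipc_log_string() below. */
  static void log_one_line(void *ilctxt, const char *text)
  {
          struct encode_context ectxt;

          msg_encode_start(&ectxt, TSV_TYPE_STRING);
          tsv_timestamp_write(&ectxt);              /* sched_clock() stamp  */
          tsv_qtimer_write(&ectxt);                 /* QTimer counter       */
          tsv_byte_array_write(&ectxt, (void *)text, strlen(text) + 1);
          msg_encode_end(&ectxt);                   /* finalize header size */
          ipc_log_write(ilctxt, &ectxt);            /* commit to the FIFO   */
  }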
+ */ +void msg_encode_start(struct encode_context *ectxt, uint32_t type) +{ + if (!ectxt) { + pr_err("%s: Invalid encode context\n", __func__); + return; + } + + ectxt->hdr.type = type; + ectxt->hdr.size = 0; + ectxt->offset = sizeof(ectxt->hdr); +} +EXPORT_SYMBOL(msg_encode_start); + +/* + * Completes the message + */ +void msg_encode_end(struct encode_context *ectxt) +{ + if (!ectxt) { + pr_err("%s: Invalid encode context\n", __func__); + return; + } + + /* finalize data size */ + ectxt->hdr.size = ectxt->offset - sizeof(ectxt->hdr); + BUG_ON(ectxt->hdr.size > MAX_MSG_SIZE); + memcpy(ectxt->buff, &ectxt->hdr, sizeof(ectxt->hdr)); +} +EXPORT_SYMBOL(msg_encode_end); + +/* + * Helper funtion used to write data to a message context. + * + * @ectxt context initialized by calling msg_encode_start() + * @data data to write + * @size number of bytes of data to write + */ +static inline int tsv_write_data(struct encode_context *ectxt, + void *data, uint32_t size) +{ + if (!ectxt) { + pr_err("%s: Invalid encode context\n", __func__); + return -EINVAL; + } + if ((ectxt->offset + size) > MAX_MSG_SIZE) { + pr_err("%s: No space to encode further\n", __func__); + return -EINVAL; + } + + memcpy((void *)(ectxt->buff + ectxt->offset), data, size); + ectxt->offset += size; + return 0; +} + +/* + * Helper function that writes a type to the context. + * + * @ectxt context initialized by calling msg_encode_start() + * @type primitive type + * @size size of primitive in bytes + */ +static inline int tsv_write_header(struct encode_context *ectxt, + uint32_t type, uint32_t size) +{ + struct tsv_header hdr; + + hdr.type = (unsigned char)type; + hdr.size = (unsigned char)size; + return tsv_write_data(ectxt, &hdr, sizeof(hdr)); +} + +/* + * Writes the current timestamp count. + * + * @ectxt context initialized by calling msg_encode_start() + */ +int tsv_timestamp_write(struct encode_context *ectxt) +{ + int ret; + uint64_t t_now = sched_clock(); + + ret = tsv_write_header(ectxt, TSV_TYPE_TIMESTAMP, sizeof(t_now)); + if (ret) + return ret; + return tsv_write_data(ectxt, &t_now, sizeof(t_now)); +} +EXPORT_SYMBOL(tsv_timestamp_write); + +/* + * Writes the current QTimer timestamp count. + * + * @ectxt context initialized by calling msg_encode_start() + */ +int tsv_qtimer_write(struct encode_context *ectxt) +{ + int ret; + uint64_t t_now = arch_counter_get_cntvct(); + + ret = tsv_write_header(ectxt, TSV_TYPE_QTIMER, sizeof(t_now)); + if (ret) + return ret; + return tsv_write_data(ectxt, &t_now, sizeof(t_now)); +} +EXPORT_SYMBOL(tsv_qtimer_write); + +/* + * Writes a data pointer. + * + * @ectxt context initialized by calling msg_encode_start() + * @pointer pointer value to write + */ +int tsv_pointer_write(struct encode_context *ectxt, void *pointer) +{ + int ret; + ret = tsv_write_header(ectxt, TSV_TYPE_POINTER, sizeof(pointer)); + if (ret) + return ret; + return tsv_write_data(ectxt, &pointer, sizeof(pointer)); +} +EXPORT_SYMBOL(tsv_pointer_write); + +/* + * Writes a 32-bit integer value. + * + * @ectxt context initialized by calling msg_encode_start() + * @n integer to write + */ +int tsv_int32_write(struct encode_context *ectxt, int32_t n) +{ + int ret; + ret = tsv_write_header(ectxt, TSV_TYPE_INT32, sizeof(n)); + if (ret) + return ret; + return tsv_write_data(ectxt, &n, sizeof(n)); +} +EXPORT_SYMBOL(tsv_int32_write); + +/* + * Writes a byte array. 
+ * + * @ectxt context initialized by calling msg_write_start() + * @data Beginning address of data + * @data_size Size of data to be written + */ +int tsv_byte_array_write(struct encode_context *ectxt, + void *data, int data_size) +{ + int ret; + ret = tsv_write_header(ectxt, TSV_TYPE_BYTE_ARRAY, data_size); + if (ret) + return ret; + return tsv_write_data(ectxt, data, data_size); +} +EXPORT_SYMBOL(tsv_byte_array_write); + +/* + * Helper function to log a string + * + * @ilctxt ipc_log_context created using ipc_log_context_create() + * @fmt Data specified using format specifiers + */ +int ipc_log_string(void *ilctxt, const char *fmt, ...) +{ + struct encode_context ectxt; + int avail_size, data_size, hdr_size = sizeof(struct tsv_header); + va_list arg_list; + + if (!ilctxt) + return -EINVAL; + + msg_encode_start(&ectxt, TSV_TYPE_STRING); + tsv_timestamp_write(&ectxt); + tsv_qtimer_write(&ectxt); + avail_size = (MAX_MSG_SIZE - (ectxt.offset + hdr_size)); + va_start(arg_list, fmt); + data_size = vscnprintf((ectxt.buff + ectxt.offset + hdr_size), + avail_size, fmt, arg_list); + va_end(arg_list); + tsv_write_header(&ectxt, TSV_TYPE_BYTE_ARRAY, data_size); + ectxt.offset += data_size; + msg_encode_end(&ectxt); + ipc_log_write(ilctxt, &ectxt); + return 0; +} +EXPORT_SYMBOL(ipc_log_string); + +/** + * ipc_log_extract - Reads and deserializes log + * + * @ctxt: logging context + * @buff: buffer to receive the data + * @size: size of the buffer + * @returns: 0 if no data read; >0 number of bytes read; < 0 error + * + * If no data is available to be read, then the ilctxt::read_avail + * completion is reinitialized. This allows clients to block + * until new log data is save. + */ +int ipc_log_extract(void *ctxt, char *buff, int size) +{ + struct encode_context ectxt; + struct decode_context dctxt; + void (*deserialize_func)(struct encode_context *ectxt, + struct decode_context *dctxt); + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + unsigned long flags; + + if (size < MAX_MSG_DECODED_SIZE) + return -EINVAL; + + dctxt.output_format = OUTPUT_DEBUGFS; + dctxt.buff = buff; + dctxt.size = size; + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + while (dctxt.size >= MAX_MSG_DECODED_SIZE && + !is_nd_read_empty(ilctxt)) { + msg_read(ilctxt, &ectxt); + deserialize_func = get_deserialization_func(ilctxt, + ectxt.hdr.type); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); + if (deserialize_func) + deserialize_func(&ectxt, &dctxt); + else + pr_err("%s: unknown message 0x%x\n", + __func__, ectxt.hdr.type); + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + } + if ((size - dctxt.size) == 0) + reinit_completion(&ilctxt->read_avail); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); + return size - dctxt.size; +} +EXPORT_SYMBOL(ipc_log_extract); + +/* + * Helper funtion used to read data from a message context. + * + * @ectxt context initialized by calling msg_read() + * @data data to read + * @size number of bytes of data to read + */ +static void tsv_read_data(struct encode_context *ectxt, + void *data, uint32_t size) +{ + BUG_ON((ectxt->offset + size) > MAX_MSG_SIZE); + memcpy(data, (ectxt->buff + ectxt->offset), size); + ectxt->offset += size; +} + +/* + * Helper function that reads a type from the context and updates the + * context pointers. 
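Putting the public entry points together: a typical in-kernel client creates a context once at probe time, logs formatted strings from its normal paths, and destroys the context on removal. A hedged usage sketch; the context name "my_driver" and the page count are arbitrary examples:

  /* Usage sketch of the API exported above (see linux/ipc_logging.h). */
  static void *my_log_ctx;

  static int my_driver_init_logging(void)
  {
          /* 2 pages of log space, debugfs dir "my_driver", user version 0 */
          my_log_ctx = ipc_log_context_create(2, "my_driver", 0);
          if (!my_log_ctx)
                  return -ENOMEM;

          ipc_log_string(my_log_ctx, "probe done, irq=%d", 42);
          return 0;
  }

  static void my_driver_exit_logging(void)
  {
          ipc_log_context_destroy(my_log_ctx);
  }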
+ * + * @ectxt context initialized by calling msg_read() + * @hdr type header + */ +static void tsv_read_header(struct encode_context *ectxt, + struct tsv_header *hdr) +{ + BUG_ON((ectxt->offset + sizeof(*hdr)) > MAX_MSG_SIZE); + memcpy(hdr, (ectxt->buff + ectxt->offset), sizeof(*hdr)); + ectxt->offset += sizeof(*hdr); +} + +/* + * Reads a timestamp. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format (appended to %6u.09u timestamp format) + */ +void tsv_timestamp_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + uint64_t val; + unsigned long nanosec_rem; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_TIMESTAMP); + tsv_read_data(ectxt, &val, sizeof(val)); + nanosec_rem = do_div(val, 1000000000U); + IPC_SPRINTF_DECODE(dctxt, "[%6u.%09lu%s/", + (unsigned)val, nanosec_rem, format); +} +EXPORT_SYMBOL(tsv_timestamp_read); + +/* + * Reads a QTimer timestamp. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format (appended to %#18llx timestamp format) + */ +void tsv_qtimer_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + uint64_t val; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_QTIMER); + tsv_read_data(ectxt, &val, sizeof(val)); + + /* + * This gives 16 hex digits of output. The # prefix prepends + * a 0x, and these characters count as part of the number. + */ + IPC_SPRINTF_DECODE(dctxt, "%#18llx]%s", val, format); +} +EXPORT_SYMBOL(tsv_qtimer_read); + +/* + * Reads a data pointer. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format + */ +void tsv_pointer_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + void *val; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_POINTER); + tsv_read_data(ectxt, &val, sizeof(val)); + + IPC_SPRINTF_DECODE(dctxt, format, val); +} +EXPORT_SYMBOL(tsv_pointer_read); + +/* + * Reads a 32-bit integer value. + * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format + */ +int32_t tsv_int32_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + int32_t val; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_INT32); + tsv_read_data(ectxt, &val, sizeof(val)); + + IPC_SPRINTF_DECODE(dctxt, format, val); + return val; +} +EXPORT_SYMBOL(tsv_int32_read); + +/* + * Reads a byte array/string. 
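A client can also register its own per-type deserializer with add_deserialization_func() (defined further down) so that debugfs extraction can render user-defined messages built from the tsv_*_read() helpers above. A hedged sketch; TSV_TYPE_MY_COUNTER is a hypothetical user-defined type value, not one defined by this patch:

  /* Hypothetical custom deserializer composed from the helpers above. */
  #define TSV_TYPE_MY_COUNTER 32        /* example user-defined type */

  static void dfunc_my_counter(struct encode_context *ectxt,
                               struct decode_context *dctxt)
  {
          tsv_timestamp_read(ectxt, dctxt, "");
          tsv_qtimer_read(ectxt, dctxt, " ");
          tsv_int32_read(ectxt, dctxt, "counter=%d\n");
  }

  /* at init: add_deserialization_func(my_log_ctx, TSV_TYPE_MY_COUNTER,
   *                                   dfunc_my_counter); */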
+ * + * @ectxt context initialized by calling msg_read() + * @dctxt deserialization context + * @format output format + */ +void tsv_byte_array_read(struct encode_context *ectxt, + struct decode_context *dctxt, const char *format) +{ + struct tsv_header hdr; + + tsv_read_header(ectxt, &hdr); + BUG_ON(hdr.type != TSV_TYPE_BYTE_ARRAY); + tsv_read_data(ectxt, dctxt->buff, hdr.size); + dctxt->buff += hdr.size; + dctxt->size -= hdr.size; +} +EXPORT_SYMBOL(tsv_byte_array_read); + +int add_deserialization_func(void *ctxt, int type, + void (*dfunc)(struct encode_context *, + struct decode_context *)) +{ + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + struct dfunc_info *df_info; + unsigned long flags; + + if (!ilctxt || !dfunc) + return -EINVAL; + + df_info = kmalloc(sizeof(struct dfunc_info), GFP_KERNEL); + if (!df_info) + return -ENOSPC; + + read_lock_irqsave(&context_list_lock_lha1, flags); + spin_lock(&ilctxt->context_lock_lhb1); + df_info->type = type; + df_info->dfunc = dfunc; + list_add_tail(&df_info->list, &ilctxt->dfunc_info_list); + spin_unlock(&ilctxt->context_lock_lhb1); + read_unlock_irqrestore(&context_list_lock_lha1, flags); + return 0; +} +EXPORT_SYMBOL(add_deserialization_func); + +static void *get_deserialization_func(struct ipc_log_context *ilctxt, + int type) +{ + struct dfunc_info *df_info = NULL; + + if (!ilctxt) + return NULL; + + list_for_each_entry(df_info, &ilctxt->dfunc_info_list, list) { + if (df_info->type == type) + return df_info->dfunc; + } + return NULL; +} + +/** + * ipc_log_context_create: Create a debug log context + * Should not be called from atomic context + * + * @max_num_pages: Number of pages of logging space required (max. 10) + * @mod_name : Name of the directory entry under DEBUGFS + * @user_version : Version number of user-defined message formats + * + * returns context id on success, NULL on failure + */ +void *ipc_log_context_create(int max_num_pages, + const char *mod_name, uint16_t user_version) +{ + struct ipc_log_context *ctxt; + struct ipc_log_page *pg = NULL; + int page_cnt; + unsigned long flags; + + ctxt = kzalloc(sizeof(struct ipc_log_context), GFP_KERNEL); + if (!ctxt) { + pr_err("%s: cannot create ipc_log_context\n", __func__); + return 0; + } + + init_completion(&ctxt->read_avail); + INIT_LIST_HEAD(&ctxt->page_list); + INIT_LIST_HEAD(&ctxt->dfunc_info_list); + spin_lock_init(&ctxt->context_lock_lhb1); + for (page_cnt = 0; page_cnt < max_num_pages; page_cnt++) { + pg = kzalloc(sizeof(struct ipc_log_page), GFP_KERNEL); + if (!pg) { + pr_err("%s: cannot create ipc_log_page\n", __func__); + goto release_ipc_log_context; + } + pg->hdr.log_id = (uint64_t)(uintptr_t)ctxt; + pg->hdr.page_num = LOG_PAGE_FLAG | page_cnt; + pg->hdr.ctx_offset = (int64_t)((uint64_t)(uintptr_t)ctxt - + (uint64_t)(uintptr_t)&pg->hdr); + + /* set magic last to signal that page init is complete */ + pg->hdr.magic = IPC_LOGGING_MAGIC_NUM; + pg->hdr.nmagic = ~(IPC_LOGGING_MAGIC_NUM); + + spin_lock_irqsave(&ctxt->context_lock_lhb1, flags); + list_add_tail(&pg->hdr.list, &ctxt->page_list); + spin_unlock_irqrestore(&ctxt->context_lock_lhb1, flags); + } + + ctxt->log_id = (uint64_t)(uintptr_t)ctxt; + ctxt->version = IPC_LOG_VERSION; + strlcpy(ctxt->name, mod_name, IPC_LOG_MAX_CONTEXT_NAME_LEN); + ctxt->user_version = user_version; + ctxt->first_page = get_first_page(ctxt); + ctxt->last_page = pg; + ctxt->write_page = ctxt->first_page; + ctxt->read_page = ctxt->first_page; + ctxt->nd_read_page = ctxt->first_page; + ctxt->write_avail = max_num_pages * 
LOG_PAGE_DATA_SIZE; + ctxt->header_size = sizeof(struct ipc_log_page_header); + create_ctx_debugfs(ctxt, mod_name); + + /* set magic last to signal context init is complete */ + ctxt->magic = IPC_LOG_CONTEXT_MAGIC_NUM; + ctxt->nmagic = ~(IPC_LOG_CONTEXT_MAGIC_NUM); + + write_lock_irqsave(&context_list_lock_lha1, flags); + list_add_tail(&ctxt->list, &ipc_log_context_list); + write_unlock_irqrestore(&context_list_lock_lha1, flags); + return (void *)ctxt; + +release_ipc_log_context: + while (page_cnt-- > 0) { + pg = get_first_page(ctxt); + list_del(&pg->hdr.list); + kfree(pg); + } + kfree(ctxt); + return 0; +} +EXPORT_SYMBOL(ipc_log_context_create); + +/* + * Destroy debug log context + * + * @ctxt: debug log context created by calling ipc_log_context_create API. + */ +int ipc_log_context_destroy(void *ctxt) +{ + struct ipc_log_context *ilctxt = (struct ipc_log_context *)ctxt; + struct ipc_log_page *pg = NULL; + unsigned long flags; + + if (!ilctxt) + return 0; + + while (!list_empty(&ilctxt->page_list)) { + pg = get_first_page(ctxt); + list_del(&pg->hdr.list); + kfree(pg); + } + + write_lock_irqsave(&context_list_lock_lha1, flags); + list_del(&ilctxt->list); + write_unlock_irqrestore(&context_list_lock_lha1, flags); + + debugfs_remove_recursive(ilctxt->dent); + + kfree(ilctxt); + return 0; +} +EXPORT_SYMBOL(ipc_log_context_destroy); + +static int __init ipc_logging_init(void) +{ + check_and_create_debugfs(); + return 0; +} + +module_init(ipc_logging_init); + +MODULE_DESCRIPTION("ipc logging"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/ipc_logging_debug.c b/kernel/trace/ipc_logging_debug.c new file mode 100644 index 000000000000..a54538798f2b --- /dev/null +++ b/kernel/trace/ipc_logging_debug.c @@ -0,0 +1,184 @@ +/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/idr.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/delay.h> +#include <linux/completion.h> +#include <linux/ipc_logging.h> + +#include "ipc_logging_private.h" + +static DEFINE_MUTEX(ipc_log_debugfs_init_lock); +static struct dentry *root_dent; + +static int debug_log(struct ipc_log_context *ilctxt, + char *buff, int size, int cont) +{ + int i = 0; + int ret; + + if (size < MAX_MSG_DECODED_SIZE) { + pr_err("%s: buffer size %d < %d\n", __func__, size, + MAX_MSG_DECODED_SIZE); + return -ENOMEM; + } + do { + i = ipc_log_extract(ilctxt, buff, size - 1); + if (cont && i == 0) { + ret = wait_for_completion_interruptible( + &ilctxt->read_avail); + if (ret < 0) + return ret; + } + } while (cont && i == 0); + + return i; +} + +/* + * VFS Read operation helper which dispatches the call to the debugfs + * read command stored in file->private_data. 
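The continuous-read path here blocks in wait_for_completion_interruptible() until ipc_log_write() signals new data, so the debugfs "log_cont" file created below behaves like a follow-mode log. A user-space sketch of draining it (assumes debugfs is mounted at /sys/kernel/debug and the hypothetical "my_driver" context from the earlier sketch):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[4096];
          ssize_t n;
          int fd = open("/sys/kernel/debug/ipc_logging/my_driver/log_cont",
                        O_RDONLY);

          if (fd < 0)
                  return 1;
          while ((n = read(fd, buf, sizeof(buf))) > 0)
                  fwrite(buf, 1, n, stdout);     /* blocks until new data */
          close(fd);
          return 0;
  }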
+ * + * @file File structure + * @buff user buffer + * @count size of user buffer + * @ppos file position to read from (only a value of 0 is accepted) + * @cont 1 = continuous mode (don't return 0 to signal end-of-file) + * + * @returns ==0 end of file + * >0 number of bytes read + * <0 error + */ +static ssize_t debug_read_helper(struct file *file, char __user *buff, + size_t count, loff_t *ppos, int cont) +{ + struct ipc_log_context *ilctxt = file->private_data; + char *buffer; + int bsize; + + buffer = kmalloc(count, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + bsize = debug_log(ilctxt, buffer, count, cont); + if (bsize > 0) { + if (copy_to_user(buff, buffer, bsize)) { + kfree(buffer); + return -EFAULT; + } + *ppos += bsize; + } + kfree(buffer); + return bsize; +} + +static ssize_t debug_read(struct file *file, char __user *buff, + size_t count, loff_t *ppos) +{ + return debug_read_helper(file, buff, count, ppos, 0); +} + +static ssize_t debug_read_cont(struct file *file, char __user *buff, + size_t count, loff_t *ppos) +{ + return debug_read_helper(file, buff, count, ppos, 1); +} + +static int debug_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static const struct file_operations debug_ops = { + .read = debug_read, + .open = debug_open, +}; + +static const struct file_operations debug_ops_cont = { + .read = debug_read_cont, + .open = debug_open, +}; + +static void debug_create(const char *name, mode_t mode, + struct dentry *dent, + struct ipc_log_context *ilctxt, + const struct file_operations *fops) +{ + debugfs_create_file(name, mode, dent, ilctxt, fops); +} + +static void dfunc_string(struct encode_context *ectxt, + struct decode_context *dctxt) +{ + tsv_timestamp_read(ectxt, dctxt, ""); + tsv_qtimer_read(ectxt, dctxt, " "); + tsv_byte_array_read(ectxt, dctxt, ""); + + /* add trailing \n if necessary */ + if (*(dctxt->buff - 1) != '\n') { + if (dctxt->size) { + ++dctxt->buff; + --dctxt->size; + } + *(dctxt->buff - 1) = '\n'; + } +} + +void check_and_create_debugfs(void) +{ + mutex_lock(&ipc_log_debugfs_init_lock); + if (!root_dent) { + root_dent = debugfs_create_dir("ipc_logging", 0); + + if (IS_ERR(root_dent)) { + pr_err("%s: unable to create debugfs %ld\n", + __func__, PTR_ERR(root_dent)); + root_dent = NULL; + } + } + mutex_unlock(&ipc_log_debugfs_init_lock); +} +EXPORT_SYMBOL(check_and_create_debugfs); + +void create_ctx_debugfs(struct ipc_log_context *ctxt, + const char *mod_name) +{ + if (!root_dent) + check_and_create_debugfs(); + + if (root_dent) { + ctxt->dent = debugfs_create_dir(mod_name, root_dent); + if (!IS_ERR(ctxt->dent)) { + debug_create("log", 0444, ctxt->dent, + ctxt, &debug_ops); + debug_create("log_cont", 0444, ctxt->dent, + ctxt, &debug_ops_cont); + } + } + add_deserialization_func((void *)ctxt, + TSV_TYPE_STRING, dfunc_string); +} +EXPORT_SYMBOL(create_ctx_debugfs); diff --git a/kernel/trace/ipc_logging_private.h b/kernel/trace/ipc_logging_private.h new file mode 100644 index 000000000000..3ac950695086 --- /dev/null +++ b/kernel/trace/ipc_logging_private.h @@ -0,0 +1,165 @@ +/* Copyright (c) 2012-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef _IPC_LOGGING_PRIVATE_H +#define _IPC_LOGGING_PRIVATE_H + +#include <linux/ipc_logging.h> + +#define IPC_LOG_VERSION 0x0003 +#define IPC_LOG_MAX_CONTEXT_NAME_LEN 32 + +/** + * struct ipc_log_page_header - Individual log page header + * + * @magic: Magic number (used for log extraction) + * @nmagic: Inverse of magic number (used for log extraction) + * @page_num: Index of page (0.. N - 1) (note top bit is always set) + * @read_offset: Read offset in page + * @write_offset: Write offset in page (or 0xFFFF if full) + * @log_id: ID of logging context that owns this page + * @start_time: Scheduler clock for first write time in page + * @end_time: Scheduler clock for last write time in page + * @ctx_offset: Signed offset from page to the logging context. Used to + * optimize ram-dump extraction. + * + * @list: Linked list of pages that make up a log + * @nd_read_offset: Non-destructive read offset used for debugfs + * + * The first part of the structure defines data that is used to extract the + * logs from a memory dump and elements in this section should not be changed + * or re-ordered. New local data structures can be added to the end of the + * structure since they will be ignored by the extraction tool. + */ +struct ipc_log_page_header { + uint32_t magic; + uint32_t nmagic; + uint32_t page_num; + uint16_t read_offset; + uint16_t write_offset; + uint64_t log_id; + uint64_t start_time; + uint64_t end_time; + int64_t ctx_offset; + + /* add local data structures after this point */ + struct list_head list; + uint16_t nd_read_offset; +}; + +/** + * struct ipc_log_page - Individual log page + * + * @hdr: Log page header + * @data: Log data + * + * Each log consists of 1 to N log pages. Data size is adjusted to always fit + * the structure into a single kernel page. 
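The comment above notes that the data array is sized so that header plus data always fill exactly one kernel page (the struct definition follows). That invariant could be made explicit with a compile-time assertion; a sketch, not present in this patch:

  /* Sketch only: assert the sizing invariant described above, e.g. from
   * ipc_log_context_create(). */
  static inline void ipc_log_page_size_check(void)
  {
          BUILD_BUG_ON(sizeof(struct ipc_log_page) != PAGE_SIZE);
  }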
+ */ +struct ipc_log_page { + struct ipc_log_page_header hdr; + char data[PAGE_SIZE - sizeof(struct ipc_log_page_header)]; +}; + +/** + * struct ipc_log_context - main logging context + * + * @magic: Magic number (used for log extraction) + * @nmagic: Inverse of magic number (used for log extraction) + * @version: IPC Logging version of log format + * @user_version: Version number for user-defined messages + * @header_size: Size of the log header which is used to determine the offset + * of ipc_log_page::data + * @log_id: Log ID (assigned when log is created) + * @name: Name of the log used to uniquely identify the log during extraction + * + * @list: List of log contexts (struct ipc_log_context) + * @page_list: List of log pages (struct ipc_log_page) + * @first_page: First page in list of logging pages + * @last_page: Last page in list of logging pages + * @write_page: Current write page + * @read_page: Current read page (for internal reads) + * @nd_read_page: Current debugfs extraction page (non-destructive) + * + * @write_avail: Number of bytes available to write in all pages + * @dent: Debugfs node for run-time log extraction + * @dfunc_info_list: List of deserialization functions + * @context_lock_lhb1: Lock for entire structure + * @read_avail: Completed when new data is added to the log + */ +struct ipc_log_context { + uint32_t magic; + uint32_t nmagic; + uint32_t version; + uint16_t user_version; + uint16_t header_size; + uint64_t log_id; + char name[IPC_LOG_MAX_CONTEXT_NAME_LEN]; + + /* add local data structures after this point */ + struct list_head list; + struct list_head page_list; + struct ipc_log_page *first_page; + struct ipc_log_page *last_page; + struct ipc_log_page *write_page; + struct ipc_log_page *read_page; + struct ipc_log_page *nd_read_page; + + uint32_t write_avail; + struct dentry *dent; + struct list_head dfunc_info_list; + spinlock_t context_lock_lhb1; + struct completion read_avail; +}; + +struct dfunc_info { + struct list_head list; + int type; + void (*dfunc) (struct encode_context *, struct decode_context *); +}; + +enum { + TSV_TYPE_INVALID, + TSV_TYPE_TIMESTAMP, + TSV_TYPE_POINTER, + TSV_TYPE_INT32, + TSV_TYPE_BYTE_ARRAY, + TSV_TYPE_QTIMER, +}; + +enum { + OUTPUT_DEBUGFS, +}; + +#define IPC_LOG_CONTEXT_MAGIC_NUM 0x25874452 +#define IPC_LOGGING_MAGIC_NUM 0x52784425 +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define IS_MSG_TYPE(x) (((x) > TSV_TYPE_MSG_START) && \ + ((x) < TSV_TYPE_MSG_END)) +#define MAX_MSG_DECODED_SIZE (MAX_MSG_SIZE*4) + +#if (defined(CONFIG_DEBUG_FS)) +void check_and_create_debugfs(void); + +void create_ctx_debugfs(struct ipc_log_context *ctxt, + const char *mod_name); +#else +void check_and_create_debugfs(void) +{ +} + +void create_ctx_debugfs(struct ipc_log_context *ctxt, const char *mod_name) +{ +} +#endif + +#endif diff --git a/kernel/trace/msm_rtb.c b/kernel/trace/msm_rtb.c new file mode 100644 index 000000000000..587082117842 --- /dev/null +++ b/kernel/trace/msm_rtb.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/atomic.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/dma-mapping.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/atomic.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/io.h> +#include <asm-generic/sizes.h> +#include <linux/msm_rtb.h> +#include <asm/timex.h> +#include <soc/qcom/minidump.h> + +#define SENTINEL_BYTE_1 0xFF +#define SENTINEL_BYTE_2 0xAA +#define SENTINEL_BYTE_3 0xFF + +#define RTB_COMPAT_STR "qcom,msm-rtb" + +/* Write + * 1) 3 bytes sentinel + * 2) 1 bytes of log type + * 3) 8 bytes of where the caller came from + * 4) 4 bytes index + * 4) 8 bytes extra data from the caller + * 5) 8 bytes of timestamp + * 6) 8 bytes of cyclecount + * + * Total = 40 bytes. + */ +struct msm_rtb_layout { + unsigned char sentinel[3]; + unsigned char log_type; + uint32_t idx; + uint64_t caller; + uint64_t data; + uint64_t timestamp; + uint64_t cycle_count; +} __attribute__ ((__packed__)); + + +struct msm_rtb_state { + struct msm_rtb_layout *rtb; + phys_addr_t phys; + int nentries; + int size; + int enabled; + int initialized; + uint32_t filter; + int step_size; +}; + +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) +DEFINE_PER_CPU(atomic_t, msm_rtb_idx_cpu); +#else +static atomic_t msm_rtb_idx; +#endif + +static struct msm_rtb_state msm_rtb = { + .filter = 1 << LOGK_LOGBUF, + .enabled = 1, +}; + +module_param_named(filter, msm_rtb.filter, uint, 0644); +module_param_named(enable, msm_rtb.enabled, int, 0644); + +static int msm_rtb_panic_notifier(struct notifier_block *this, + unsigned long event, void *ptr) +{ + msm_rtb.enabled = 0; + return NOTIFY_DONE; +} + +static struct notifier_block msm_rtb_panic_blk = { + .notifier_call = msm_rtb_panic_notifier, + .priority = INT_MAX, +}; + +int notrace msm_rtb_event_should_log(enum logk_event_type log_type) +{ + return msm_rtb.initialized && msm_rtb.enabled && + ((1 << (log_type & ~LOGTYPE_NOPC)) & msm_rtb.filter); +} +EXPORT_SYMBOL(msm_rtb_event_should_log); + +static void msm_rtb_emit_sentinel(struct msm_rtb_layout *start) +{ + start->sentinel[0] = SENTINEL_BYTE_1; + start->sentinel[1] = SENTINEL_BYTE_2; + start->sentinel[2] = SENTINEL_BYTE_3; +} + +static void msm_rtb_write_type(enum logk_event_type log_type, + struct msm_rtb_layout *start) +{ + start->log_type = (char)log_type; +} + +static void msm_rtb_write_caller(uint64_t caller, struct msm_rtb_layout *start) +{ + start->caller = caller; +} + +static void msm_rtb_write_idx(uint32_t idx, + struct msm_rtb_layout *start) +{ + start->idx = idx; +} + +static void msm_rtb_write_data(uint64_t data, struct msm_rtb_layout *start) +{ + start->data = data; +} + +static void msm_rtb_write_timestamp(struct msm_rtb_layout *start) +{ + start->timestamp = sched_clock(); +} + +static void msm_rtb_write_cyclecount(struct msm_rtb_layout *start) +{ + start->cycle_count = get_cycles(); +} + +static void uncached_logk_pc_idx(enum logk_event_type log_type, uint64_t caller, + uint64_t data, int idx) +{ + struct msm_rtb_layout *start; + + start = &msm_rtb.rtb[idx & (msm_rtb.nentries - 1)]; + + msm_rtb_emit_sentinel(start); + msm_rtb_write_type(log_type, start); + msm_rtb_write_caller(caller, start); + msm_rtb_write_idx(idx, start); + msm_rtb_write_data(data, start); + msm_rtb_write_timestamp(start); + msm_rtb_write_cyclecount(start); + mb(); + + return; +} + +static void uncached_logk_timestamp(int idx) +{ + 
unsigned long long timestamp; + + timestamp = sched_clock(); + uncached_logk_pc_idx(LOGK_TIMESTAMP|LOGTYPE_NOPC, + (uint64_t)lower_32_bits(timestamp), + (uint64_t)upper_32_bits(timestamp), idx); +} + +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) +static int msm_rtb_get_idx(void) +{ + int cpu, i, offset; + atomic_t *index; + + /* + * ideally we would use get_cpu but this is a close enough + * approximation for our purposes. + */ + cpu = raw_smp_processor_id(); + + index = &per_cpu(msm_rtb_idx_cpu, cpu); + + i = atomic_add_return(msm_rtb.step_size, index); + i -= msm_rtb.step_size; + + /* Check if index has wrapped around */ + offset = (i & (msm_rtb.nentries - 1)) - + ((i - msm_rtb.step_size) & (msm_rtb.nentries - 1)); + if (offset < 0) { + uncached_logk_timestamp(i); + i = atomic_add_return(msm_rtb.step_size, index); + i -= msm_rtb.step_size; + } + + return i; +} +#else +static int msm_rtb_get_idx(void) +{ + int i, offset; + + i = atomic_inc_return(&msm_rtb_idx); + i--; + + /* Check if index has wrapped around */ + offset = (i & (msm_rtb.nentries - 1)) - + ((i - 1) & (msm_rtb.nentries - 1)); + if (offset < 0) { + uncached_logk_timestamp(i); + i = atomic_inc_return(&msm_rtb_idx); + i--; + } + + return i; +} +#endif + +int notrace uncached_logk_pc(enum logk_event_type log_type, void *caller, + void *data) +{ + int i; + + if (!msm_rtb_event_should_log(log_type)) + return 0; + + i = msm_rtb_get_idx(); + uncached_logk_pc_idx(log_type, (uint64_t)((unsigned long) caller), + (uint64_t)((unsigned long) data), i); + + return 1; +} +EXPORT_SYMBOL(uncached_logk_pc); + +noinline int notrace uncached_logk(enum logk_event_type log_type, void *data) +{ + return uncached_logk_pc(log_type, __builtin_return_address(0), data); +} +EXPORT_SYMBOL(uncached_logk); + +static int msm_rtb_probe(struct platform_device *pdev) +{ + struct msm_rtb_platform_data *d = pdev->dev.platform_data; + struct md_region md_entry; +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) + unsigned int cpu; +#endif + int ret; + + if (!pdev->dev.of_node) { + msm_rtb.size = d->size; + } else { + u64 size; + struct device_node *pnode; + + pnode = of_parse_phandle(pdev->dev.of_node, + "linux,contiguous-region", 0); + if (pnode != NULL) { + const u32 *addr; + + addr = of_get_address(pnode, 0, &size, NULL); + if (!addr) { + of_node_put(pnode); + return -EINVAL; + } + of_node_put(pnode); + } else { + ret = of_property_read_u32(pdev->dev.of_node, + "qcom,rtb-size", + (u32 *)&size); + if (ret < 0) + return ret; + + } + + msm_rtb.size = size; + } + + if (msm_rtb.size <= 0 || msm_rtb.size > SZ_1M) + return -EINVAL; + + msm_rtb.rtb = dma_alloc_coherent(&pdev->dev, msm_rtb.size, + &msm_rtb.phys, + GFP_KERNEL); + + if (!msm_rtb.rtb) + return -ENOMEM; + + msm_rtb.nentries = msm_rtb.size / sizeof(struct msm_rtb_layout); + + /* Round this down to a power of 2 */ + msm_rtb.nentries = __rounddown_pow_of_two(msm_rtb.nentries); + + memset(msm_rtb.rtb, 0, msm_rtb.size); + + strlcpy(md_entry.name, "KRTB_BUF", sizeof(md_entry.name)); + md_entry.virt_addr = (uintptr_t)msm_rtb.rtb; + md_entry.phys_addr = msm_rtb.phys; + md_entry.size = msm_rtb.size; + if (msm_minidump_add_region(&md_entry)) + pr_info("Failed to add RTB in Minidump\n"); + +#if defined(CONFIG_QCOM_RTB_SEPARATE_CPUS) + for_each_possible_cpu(cpu) { + atomic_t *a = &per_cpu(msm_rtb_idx_cpu, cpu); + atomic_set(a, cpu); + } + msm_rtb.step_size = num_possible_cpus(); +#else + atomic_set(&msm_rtb_idx, 0); + msm_rtb.step_size = 1; +#endif + + atomic_notifier_chain_register(&panic_notifier_list, + &msm_rtb_panic_blk); 
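For clarity on the CONFIG_QCOM_RTB_SEPARATE_CPUS indexing above: each CPU's atomic index starts at its CPU number and advances by step_size (num_possible_cpus()), so CPU n only ever writes slots n, n+step, n+2*step, ... modulo nentries. A small user-space sketch of the slot arithmetic, assuming 2 possible CPUs and an 8-entry buffer:

  #include <stdio.h>

  int main(void)
  {
          const int nentries = 8, step = 2;   /* 2 CPUs, 8 slots */
          int cpu, i;

          for (cpu = 0; cpu < step; cpu++) {
                  printf("cpu%d slots:", cpu);
                  /* idx starts at 'cpu' and advances by 'step', as in
                   * msm_rtb_get_idx(); slot = idx & (nentries - 1)    */
                  for (i = cpu; i < cpu + 4 * step; i += step)
                          printf(" %d", i & (nentries - 1));
                  printf("\n");
          }
          return 0;
  }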
+ msm_rtb.initialized = 1; + return 0; +} + +static struct of_device_id msm_match_table[] = { + {.compatible = RTB_COMPAT_STR}, + {}, +}; + +static struct platform_driver msm_rtb_driver = { + .driver = { + .name = "msm_rtb", + .owner = THIS_MODULE, + .of_match_table = msm_match_table + }, +}; + +static int __init msm_rtb_init(void) +{ + return platform_driver_probe(&msm_rtb_driver, msm_rtb_probe); +} + +static void __exit msm_rtb_exit(void) +{ + platform_driver_unregister(&msm_rtb_driver); +} +module_init(msm_rtb_init) +module_exit(msm_rtb_exit) diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..49fa2e6eea98 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); - diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d197b18e09c..a67f792fb950 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,6 +42,7 @@ #include <linux/nmi.h> #include <linux/fs.h> #include <linux/sched/rt.h> +#include <linux/coresight-stm.h> #include "trace.h" #include "trace_output.h" @@ -574,8 +575,11 @@ int __trace_puts(unsigned long ip, const char *str, int size) if (entry->buf[size - 1] != '\n') { entry->buf[size] = '\n'; entry->buf[size + 1] = '\0'; - } else + stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 2); + } else { entry->buf[size] = '\0'; + stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, size + 1); + } __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); @@ -616,6 +620,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry = ring_buffer_event_data(event); entry->ip = ip; entry->str = str; + stm_log(OST_ENTITY_TRACE_PRINTK, entry->str, strlen(entry->str)+1); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); @@ -1360,11 +1365,11 @@ void tracing_reset_all_online_cpus(void) #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT]; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; + unsigned *map_cmdline_to_tgid; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; @@ -1395,12 +1400,23 @@ static int allocate_cmdlines_buffer(unsigned int val, return -ENOMEM; } + s->map_cmdline_to_tgid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_tgid), + GFP_KERNEL); + if (!s->map_cmdline_to_tgid) { + kfree(s->map_cmdline_to_pid); + kfree(s->saved_cmdlines); + return -ENOMEM; + } + s->cmdline_idx = 0; s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); + memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_tgid)); return 0; } @@ -1569,14 +1585,17 @@ static int trace_save_cmdline(struct task_struct *tsk) tpid = tsk->pid & (PID_MAX_DEFAULT - 1); + preempt_disable(); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. 
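The trace.c hunk here brackets the arch_spin_trylock()/arch_spin_unlock() pair with preempt_disable()/preempt_enable(). Unlike spin_lock(), the raw arch_spin_* primitives do not touch the preempt count, so without the bracket the holder could be preempted while the cmdline/tgid maps are locked. The resulting pattern, shown in isolation as a sketch:

  /* Isolated sketch of the locking pattern introduced above: pin the
   * task while the raw arch spinlock is held. */
  static int example_locked_update(void)
  {
          preempt_disable();
          if (!arch_spin_trylock(&trace_cmdline_lock)) {
                  preempt_enable();
                  return 0;               /* miss: better luck next time */
          }
          /* ... update the pid/cmdline/tgid maps here ... */
          arch_spin_unlock(&trace_cmdline_lock);
          preempt_enable();
          return 1;
  }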
*/ - if (!arch_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) { + preempt_enable(); return 0; + } idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { @@ -1588,8 +1607,9 @@ static int trace_save_cmdline(struct task_struct *tsk) savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); - saved_tgids[idx] = tsk->tgid; + savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return 1; } @@ -1632,19 +1652,29 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -int trace_find_tgid(int pid) +static int __find_tgid_locked(int pid) { unsigned map; int tgid; - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - tgid = saved_tgids[map]; + tgid = savedcmd->map_cmdline_to_tgid[map]; else tgid = -1; + return tgid; +} + +int trace_find_tgid(int pid) +{ + int tgid; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + tgid = __find_tgid_locked(pid); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -2255,6 +2285,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { + stm_log(OST_ENTITY_TRACE_PRINTK, entry->buf, len + 1); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } @@ -3989,10 +4020,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, { char buf[64]; int r; + unsigned int n; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + n = savedcmd->cmdline_num; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + r = scnprintf(buf, sizeof(buf), "%u\n", n); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -4001,6 +4037,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { kfree(s->saved_cmdlines); kfree(s->map_cmdline_to_pid); + kfree(s->map_cmdline_to_tgid); kfree(s); } @@ -4017,10 +4054,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -4239,33 +4278,61 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf, char *file_buf; char *buf; int len = 0; - int pid; int i; + int *pids; + int n = 0; - file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); - buf = file_buf; + pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL); + if (!pids) { + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + return -ENOMEM; + } - for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) { - int tgid; - int r; + for (i = 0; i < savedcmd->cmdline_num; i++) { + int pid; pid = savedcmd->map_cmdline_to_pid[i]; if (pid == -1 || pid == NO_CMDLINE_MAP) continue; - tgid = trace_find_tgid(pid); - r = sprintf(buf, "%d %d\n", pid, tgid); + pids[n] = pid; + pids[n+1] = __find_tgid_locked(pid); + n += 2; + } + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + if (n == 0) { + kfree(pids); + return 0; + } + + /* enough to hold max pair of pids + space, lr and nul */ + len = n * 12; + file_buf = kmalloc(len, GFP_KERNEL); + if 
(!file_buf) { + kfree(pids); + return -ENOMEM; + } + + buf = file_buf; + for (i = 0; i < n && len > 0; i += 2) { + int r; + + r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]); buf += r; - len += r; + len -= r; } len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + file_buf, buf - file_buf); kfree(file_buf); + kfree(pids); return len; } @@ -5296,8 +5363,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; - } else + stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 2); + } else { entry->buf[cnt] = '\0'; + stm_log(OST_ENTITY_TRACE_MARKER, entry->buf, cnt + 1); + } __buffer_unlock_commit(buffer, event); diff --git a/kernel/trace/trace_cpu_freq_switch.c b/kernel/trace/trace_cpu_freq_switch.c new file mode 100644 index 000000000000..f9dab6c4bb72 --- /dev/null +++ b/kernel/trace/trace_cpu_freq_switch.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2012, 2016 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hrtimer.h> +#include <linux/tracefs.h> +#include <linux/ktime.h> +#include <trace/events/power.h> +#include "trace_stat.h" +#include "trace.h" + +struct trans { + struct rb_node node; + unsigned int cpu; + unsigned int start_freq; + unsigned int end_freq; + unsigned int min_us; + unsigned int max_us; + ktime_t total_t; + unsigned int count; +}; +static struct rb_root freq_trans_tree = RB_ROOT; + +static struct trans *tr_search(struct rb_root *root, unsigned int cpu, + unsigned int start_freq, unsigned int end_freq) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct trans *tr = container_of(node, struct trans, node); + + if (cpu < tr->cpu) + node = node->rb_left; + else if (cpu > tr->cpu) + node = node->rb_right; + else if (start_freq < tr->start_freq) + node = node->rb_left; + else if (start_freq > tr->start_freq) + node = node->rb_right; + else if (end_freq < tr->end_freq) + node = node->rb_left; + else if (end_freq > tr->end_freq) + node = node->rb_right; + else + return tr; + } + return NULL; +} + +static int tr_insert(struct rb_root *root, struct trans *tr) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct trans *this = container_of(*new, struct trans, node); + + parent = *new; + if (tr->cpu < this->cpu) + new = &((*new)->rb_left); + else if (tr->cpu > this->cpu) + new = &((*new)->rb_right); + else if (tr->start_freq < this->start_freq) + new = &((*new)->rb_left); + else if (tr->start_freq > this->start_freq) + new = &((*new)->rb_right); + else if (tr->end_freq < this->end_freq) + new = &((*new)->rb_left); + else if (tr->end_freq > this->end_freq) + new = &((*new)->rb_right); + else + return -EINVAL; + } + + rb_link_node(&tr->node, parent, new); + rb_insert_color(&tr->node, root); + + return 0; +} + +struct trans_state { + spinlock_t lock; + unsigned int start_freq; + unsigned int end_freq; + ktime_t start_t; + 
bool started; +}; +static DEFINE_PER_CPU(struct trans_state, freq_trans_state); + +static DEFINE_SPINLOCK(state_lock); + +static void probe_start(void *ignore, unsigned int start_freq, + unsigned int end_freq, unsigned int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + per_cpu(freq_trans_state, cpu).start_freq = start_freq; + per_cpu(freq_trans_state, cpu).end_freq = end_freq; + per_cpu(freq_trans_state, cpu).start_t = ktime_get(); + per_cpu(freq_trans_state, cpu).started = true; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void probe_end(void *ignore, unsigned int cpu) +{ + unsigned long flags; + struct trans *tr; + s64 dur_us; + ktime_t dur_t, end_t = ktime_get(); + + spin_lock_irqsave(&state_lock, flags); + + if (!per_cpu(freq_trans_state, cpu).started) + goto out; + + dur_t = ktime_sub(end_t, per_cpu(freq_trans_state, cpu).start_t); + dur_us = ktime_to_us(dur_t); + + tr = tr_search(&freq_trans_tree, cpu, + per_cpu(freq_trans_state, cpu).start_freq, + per_cpu(freq_trans_state, cpu).end_freq); + if (!tr) { + tr = kzalloc(sizeof(*tr), GFP_ATOMIC); + if (!tr) { + WARN_ONCE(1, "CPU frequency trace is now invalid!\n"); + goto out; + } + + tr->start_freq = per_cpu(freq_trans_state, cpu).start_freq; + tr->end_freq = per_cpu(freq_trans_state, cpu).end_freq; + tr->cpu = cpu; + tr->min_us = UINT_MAX; + tr_insert(&freq_trans_tree, tr); + } + tr->total_t = ktime_add(tr->total_t, dur_t); + tr->count++; + + if (dur_us > tr->max_us) + tr->max_us = dur_us; + if (dur_us < tr->min_us) + tr->min_us = dur_us; + + per_cpu(freq_trans_state, cpu).started = false; +out: + spin_unlock_irqrestore(&state_lock, flags); +} + +static void *freq_switch_stat_start(struct tracer_stat *trace) +{ + struct rb_node *n; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + n = rb_first(&freq_trans_tree); + spin_unlock_irqrestore(&state_lock, flags); + + return n; +} + +static void *freq_switch_stat_next(void *prev, int idx) +{ + struct rb_node *n; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + n = rb_next(prev); + spin_unlock_irqrestore(&state_lock, flags); + + return n; +} + +static int freq_switch_stat_show(struct seq_file *s, void *p) +{ + unsigned long flags; + struct trans *tr = p; + + spin_lock_irqsave(&state_lock, flags); + seq_printf(s, "%3d %9d %8d %5d %6lld %6d %6d\n", tr->cpu, + tr->start_freq, tr->end_freq, tr->count, + div_s64(ktime_to_us(tr->total_t), tr->count), + tr->min_us, tr->max_us); + spin_unlock_irqrestore(&state_lock, flags); + + return 0; +} + +static void freq_switch_stat_release(void *stat) +{ + struct trans *tr = stat; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + rb_erase(&tr->node, &freq_trans_tree); + spin_unlock_irqrestore(&state_lock, flags); + kfree(tr); +} + +static int freq_switch_stat_headers(struct seq_file *s) +{ + seq_puts(s, "CPU START_KHZ END_KHZ COUNT AVG_US MIN_US MAX_US\n"); + seq_puts(s, " | | | | | | |\n"); + return 0; +} + +struct tracer_stat freq_switch_stats __read_mostly = { + .name = "cpu_freq_switch", + .stat_start = freq_switch_stat_start, + .stat_next = freq_switch_stat_next, + .stat_show = freq_switch_stat_show, + .stat_release = freq_switch_stat_release, + .stat_headers = freq_switch_stat_headers +}; + +static void trace_freq_switch_disable(void) +{ + unregister_stat_tracer(&freq_switch_stats); + unregister_trace_cpu_frequency_switch_end(probe_end, NULL); + unregister_trace_cpu_frequency_switch_start(probe_start, NULL); + pr_info("disabled cpu frequency switch time 
profiling\n"); +} + +static int trace_freq_switch_enable(void) +{ + int ret; + + ret = register_trace_cpu_frequency_switch_start(probe_start, NULL); + if (ret) + goto out; + + ret = register_trace_cpu_frequency_switch_end(probe_end, NULL); + if (ret) + goto err_register_switch_end; + + ret = register_stat_tracer(&freq_switch_stats); + if (ret) + goto err_register_stat_tracer; + + pr_info("enabled cpu frequency switch time profiling\n"); + return 0; + +err_register_stat_tracer: + unregister_trace_cpu_frequency_switch_end(probe_end, NULL); +err_register_switch_end: + register_trace_cpu_frequency_switch_start(probe_start, NULL); +out: + pr_err("failed to enable cpu frequency switch time profiling\n"); + + return ret; +} + +static DEFINE_MUTEX(debugfs_lock); +static bool trace_freq_switch_enabled; + +static int debug_toggle_tracing(void *data, u64 val) +{ + int ret = 0; + + mutex_lock(&debugfs_lock); + + if (val == 1 && !trace_freq_switch_enabled) + ret = trace_freq_switch_enable(); + else if (val == 0 && trace_freq_switch_enabled) + trace_freq_switch_disable(); + else if (val > 1) + ret = -EINVAL; + + if (!ret) + trace_freq_switch_enabled = val; + + mutex_unlock(&debugfs_lock); + + return ret; +} + +static int debug_tracing_state_get(void *data, u64 *val) +{ + mutex_lock(&debugfs_lock); + *val = trace_freq_switch_enabled; + mutex_unlock(&debugfs_lock); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(debug_tracing_state_fops, debug_tracing_state_get, + debug_toggle_tracing, "%llu\n"); + +static int __init trace_freq_switch_init(void) +{ + struct dentry *d_tracer = tracing_init_dentry(); + + if (IS_ERR(d_tracer)) + return 0; + + tracefs_create_file("cpu_freq_switch_profile_enabled", + S_IRUGO | S_IWUSR, d_tracer, NULL, &debug_tracing_state_fops); + + return 0; +} +late_initcall(trace_freq_switch_init); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a9319be..731f6484b811 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -256,7 +256,8 @@ int perf_trace_add(struct perf_event *p_event, int flags) void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); + if (!hlist_unhashed(&p_event->hlist_entry)) + hlist_del_rcu(&p_event->hlist_entry); tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b89e00c748f1..48aee7ff6381 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -287,14 +287,15 @@ static void output_printk(struct trace_event_buffer *fbuffer) spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } -void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer, + unsigned long len) { if (tracepoint_printk) output_printk(fbuffer); event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); + fbuffer->flags, fbuffer->pc, len); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 21b162c07e83..c00137ea939e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -13,6 +13,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/sched/sysctl.h> #include "trace.h" @@ -39,6 +40,12 @@ static int save_flags; static void stop_irqsoff_tracer(struct 
trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); +/* + * irqsoff stack tracing threshold in ns. + * default: 1ms + */ +unsigned int sysctl_irqsoff_tracing_threshold_ns = 1000000UL; + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -454,17 +461,52 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_PREEMPTIRQ_EVENTS +struct irqsoff_store { + u64 ts; + unsigned long caddr[4]; +}; + +static DEFINE_PER_CPU(struct irqsoff_store, the_irqsoff); +#endif /* CONFIG_PREEMPTIRQ_EVENTS */ + /* * We are only interested in hardirq on/off events: */ static inline void tracer_hardirqs_on(void) { +#ifdef CONFIG_PREEMPTIRQ_EVENTS + struct irqsoff_store *is = &per_cpu(the_irqsoff, + raw_smp_processor_id()); + + if (!is->ts) { + is->ts = sched_clock(); + is->caddr[0] = CALLER_ADDR0; + is->caddr[1] = CALLER_ADDR1; + is->caddr[2] = CALLER_ADDR2; + is->caddr[3] = CALLER_ADDR3; + } +#endif /* CONFIG_PREEMPTIRQ_EVENTS */ if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } static inline void tracer_hardirqs_off(void) { +#ifdef CONFIG_PREEMPTIRQ_EVENTS + struct irqsoff_store *is = &per_cpu(the_irqsoff, + raw_smp_processor_id()); + u64 delta = 0; + + if (is->ts) { + delta = sched_clock() - is->ts; + is->ts = 0; + } + if (delta > sysctl_irqsoff_tracing_threshold_ns) + trace_irqs_disable(delta, is->caddr[0], is->caddr[1], + is->caddr[2], is->caddr[3]); +#endif /* CONFIG_PREEMPTIRQ_EVENTS */ + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ad1d6164e946..e82cff5c842c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -304,7 +304,7 @@ static int t_show(struct seq_file *m, void *v) if (!*fmt) return 0; - seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); + seq_printf(m, "0x%lx : \"", 0L); /* * Tabs and new lines need to be converted. 
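The trace_irqsoff.c hunk above adds a per-CPU irqsoff_store and a nanosecond threshold, sysctl_irqsoff_tracing_threshold_ns (1 ms by default), so the irqs_disable tracepoint only fires for sections that last longer than the threshold, together with the four caller addresses captured on entry. Below is a minimal sketch of that capture-then-compare pattern, not a line-for-line mirror of the hunk: it assumes the timestamp is taken when the section of interest begins and is checked when it ends, section_begin()/section_end() are illustrative names, and only the timestamp is kept (the hunk's store also records CALLER_ADDR0..3 and emits trace_irqs_disable() instead of a printk).

#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* illustrative per-CPU scratch timestamp */
static DEFINE_PER_CPU(u64, section_ts);

static void section_begin(void)
{
	u64 *ts = this_cpu_ptr(&section_ts);

	if (!*ts)			/* keep the outermost timestamp */
		*ts = sched_clock();
}

static void section_end(void)
{
	u64 *ts = this_cpu_ptr(&section_ts);
	u64 delta = 0;

	if (*ts) {
		delta = sched_clock() - *ts;
		*ts = 0;
	}
	/* report only sections longer than the tunable threshold (ns) */
	if (delta > sysctl_irqsoff_tracing_threshold_ns)
		pr_debug("section lasted %llu ns\n",
			 (unsigned long long)delta);
}

Keeping this state per-CPU avoids any locking on these extremely hot paths; with interrupts disabled the section cannot migrate, so the matching end always runs on the CPU that recorded the start.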
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 6403f45da9d5..927fd4ad5846 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -359,7 +359,8 @@ static bool report_latency(struct trace_array *tr, cycle_t delta) } static void -probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu, + unsigned int load) { if (task != wakeup_task) return; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..a01740a98afa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -336,7 +336,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + irq_flags, pc, 0); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -382,7 +382,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->ret = syscall_get_return_value(current, regs); event_trigger_unlock_commit(trace_file, buffer, event, entry, - irq_flags, pc); + irq_flags, pc, 0); } static int reg_event_syscall_enter(struct trace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 518e62a398d2..0839129fbbd8 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -830,7 +830,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); + event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0, 0); } /* uprobe handler */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b718530034d2..f527d8e65d69 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/cpu.h> +#include <linux/device.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/module.h> @@ -20,11 +21,13 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> #include <linux/perf_event.h> #include <linux/kthread.h> +#include <soc/qcom/watchdog.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -94,6 +97,7 @@ static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned int, watchdog_en); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); @@ -230,7 +234,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. 
It doesn't matter which CPU's timestamp @@ -238,6 +250,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -251,6 +269,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -346,8 +365,11 @@ static void watchdog_check_hardlockup_other_cpu(void) if (per_cpu(hard_watchdog_warn, next_cpu) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu); + if (hardlockup_panic) { + pr_err("Watchdog detected hard LOCKUP on cpu %u", + next_cpu); + msm_trigger_wdog_bite(); + } else WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu); @@ -409,6 +431,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + if (hardlockup_panic) + msm_trigger_wdog_bite(); + print_modules(); print_irqtrace_events(current); if (regs) @@ -531,6 +556,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + + if (softlockup_panic) + msm_trigger_wdog_bite(); __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); @@ -567,9 +595,13 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) sched_setscheduler(current, policy, ¶m); } -static void watchdog_enable(unsigned int cpu) +void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + if (*enabled) + return; /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -585,16 +617,40 @@ static void watchdog_enable(unsigned int cpu) /* initialize timestamp */ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); __touch_watchdog(); + + /* + * Need to ensure above operations are observed by other CPUs before + * indicating that timer is enabled. This is to synchronize core + * isolation and hotplug. Core isolation will wait for this flag to be + * set. 
+ */ + mb(); + *enabled = 1; } -static void watchdog_disable(unsigned int cpu) +void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + unsigned int *enabled = raw_cpu_ptr(&watchdog_en); + + if (!*enabled) + return; watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ watchdog_nmi_disable(cpu); + + /* + * No need for barrier here since disabling the watchdog is + * synchronized with hotplug lock + */ + *enabled = 0; +} + +bool watchdog_configured(unsigned int cpu) +{ + return *per_cpu_ptr(&watchdog_en, cpu); } static void watchdog_cleanup(unsigned int cpu, bool online) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b7eed05ea987..46efb8ed5224 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,8 @@ #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> +#include <linux/bug.h> +#include <linux/delay.h> #include "workqueue_internal.h" @@ -149,6 +151,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -1123,6 +1127,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1281,6 +1287,12 @@ fail: if (work_is_canceling(work)) return -ENOENT; cpu_relax(); + /* + * The queueing is in progress in another context. If we keep + * taking the pool->lock in a busy loop, the other context may + * never get the lock. Give 1 usec delay to avoid this contention. + */ + udelay(1); return -EAGAIN; } @@ -1424,6 +1436,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -1496,8 +1510,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, return; } - timer_stats_timer_set_start_info(&dwork->timer); - dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; @@ -2076,6 +2088,7 @@ __acquires(&pool->lock) current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); + BUG_ON(PANIC_CORRUPTION); dump_stack(); } @@ -2191,6 +2204,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2274,6 +2289,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2290,9 +2306,14 @@ repeat: * process'em. 
*/ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2904,6 +2925,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2922,20 +2968,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); @@ -3109,6 +3142,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -4407,7 +4441,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5277,6 +5313,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. 
+ */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5400,6 +5584,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues); |
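The workqueue watchdog above declares a pool stalled when jiffies moves past the newest of three timestamps plus the threshold: the pool's watchdog_ts (advanced whenever the first entry on ->worklist changes), the global wq_watchdog_touched, and, for CPU-bound pools, the per-CPU wq_watchdog_touched_cpu. The sketch below is a condensed restatement of that per-pool test from wq_watchdog_timer_fn(); pool_stalled() and its plain-argument form are illustrative only.

#include <linux/jiffies.h>

static bool pool_stalled(unsigned long pool_ts, unsigned long global_touched,
			 unsigned long cpu_touched, unsigned long thresh_jiffies)
{
	unsigned long ts = pool_ts;

	/* the most recent sign of life wins */
	if (time_after(global_touched, ts))
		ts = global_touched;
	if (time_after(cpu_touched, ts))
		ts = cpu_touched;

	return time_after(jiffies, ts + thresh_jiffies);
}

As the header comment notes, the threshold is given in seconds (multiplied by HZ before use) and can be set at boot with workqueue.watchdog_thresh= or updated later through the corresponding module parameter file. wq_watchdog_set_thresh() first zeroes the threshold and stops the deferrable timer, then re-touches every timestamp before re-arming, so lowering the threshold cannot produce an instant false positive.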