#include #include #include #include #include #include #include #include #include #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; /* * System energy normalization constants */ static struct target_nrg { unsigned long min_power; unsigned long max_power; struct reciprocal_value rdiv; } schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; /* Performance Constraint region (C) threshold params */ static int perf_constrain_idx; /** * Performance-Energy (P-E) Space thresholds constants */ struct threshold_params { int nrg_gain; int cap_gain; }; /* * System specific P-E space thresholds constants */ static struct threshold_params threshold_gains[] = { { 0, 4 }, /* >= 0% */ { 0, 4 }, /* >= 10% */ { 1, 4 }, /* >= 20% */ { 2, 4 }, /* >= 30% */ { 3, 4 }, /* >= 40% */ { 4, 3 }, /* >= 50% */ { 4, 2 }, /* >= 60% */ { 4, 1 }, /* >= 70% */ { 4, 0 }, /* >= 80% */ { 4, 0 } /* >= 90% */ }; static int __schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_boost_idx, int perf_constrain_idx) { int payoff = -INT_MAX; /* Performance Boost (B) region */ if (nrg_delta > 0 && cap_delta > 0) { /* * Evaluate "Performance Boost" vs "Energy Increase" * payoff criteria: * cap_delta / nrg_delta < cap_gain / nrg_gain * which is: * nrg_delta * cap_gain > cap_delta * nrg_gain */ payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; return payoff; } /* Performance Constraint (C) region */ if (nrg_delta < 0 && cap_delta < 0) { /* * Evaluate "Performance Boost" vs "Energy Increase" * payoff criteria: * cap_delta / nrg_delta > cap_gain / nrg_gain * which is: * cap_delta * nrg_gain > nrg_delta * cap_gain */ payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; return payoff; } /* Default: reject schedule candidate */ return payoff; } #ifdef CONFIG_CGROUP_SCHEDTUNE /* * EAS scheduler tunables for task groups. */ /* SchdTune tunables for a group of tasks */ struct schedtune { /* SchedTune CGroup subsystem */ struct cgroup_subsys_state css; /* Boost group allocated ID */ int idx; /* Boost value for tasks on that SchedTune CGroup */ int boost; /* Performance Boost (B) region threshold params */ int perf_boost_idx; /* Performance Constraint (C) region threshold params */ int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) { return css ? container_of(css, struct schedtune, css) : NULL; } static inline struct schedtune *task_schedtune(struct task_struct *tsk) { return css_st(task_css(tsk, schedtune_cgrp_id)); } static inline struct schedtune *parent_st(struct schedtune *st) { return css_st(st->css.parent); } /* * SchedTune root control group * The root control group is used to defined a system-wide boosting tuning, * which is applied to all tasks in the system. * Task specific boost tuning could be specified by creating and * configuring a child control group under the root one. * By default, system-wide boosting is disabled, i.e. no boosting is applied * to tasks which are not into a child control group. */ static struct schedtune root_schedtune = { .boost = 0, .perf_boost_idx = 0, .perf_constrain_idx = 0, }; int schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task) { struct schedtune *ct; int perf_boost_idx; int perf_constrain_idx; /* Optimal (O) region */ if (nrg_delta < 0 && cap_delta > 0) return INT_MAX; /* Suboptimal (S) region */ if (nrg_delta > 0 && cap_delta < 0) return -INT_MAX; /* Get task specific perf Boost/Constraints indexes */ rcu_read_lock(); ct = task_schedtune(task); perf_boost_idx = ct->perf_boost_idx; perf_constrain_idx = ct->perf_constrain_idx; rcu_read_unlock(); return __schedtune_accept_deltas(nrg_delta, cap_delta, perf_boost_idx, perf_constrain_idx); } /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of * boost groups for two main reasons: * 1. on a real system we usually have only few classes of workloads which * make sense to boost with different values (e.g. background vs foreground * tasks, interactive vs low-priority tasks) * 2. a limited number allows for a simpler and more memory/time efficient * implementation especially for the computation of the per-CPU boost * value */ #define BOOSTGROUPS_COUNT 4 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { &root_schedtune, NULL, }; /* SchedTune boost groups * Keep track of all the boost groups which impact on CPU, for example when a * CPU has two RUNNABLE tasks belonging to two different boost groups and thus * likely with different boost values. * Since on each system we expect only a limited number of boost groups, here * we use a simple array to keep track of the metrics required to compute the * maximum per-CPU boosting value. */ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ unsigned boost_max; struct { /* The boost for tasks on that boost group */ unsigned boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; }; /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); static void schedtune_cpu_update(int cpu) { struct boost_groups *bg; unsigned boost_max; int idx; bg = &per_cpu(cpu_boost_groups, cpu); /* The root boost group is always active */ boost_max = bg->group[0].boost; for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { /* * A boost group affects a CPU only if it has * RUNNABLE tasks on that CPU */ if (bg->group[idx].tasks == 0) continue; boost_max = max(boost_max, bg->group[idx].boost); } bg->boost_max = boost_max; } static int schedtune_boostgroup_update(int idx, int boost) { struct boost_groups *bg; int cur_boost_max; int old_boost; int cpu; /* Update per CPU boost groups */ for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); /* * Keep track of current boost values to compute the per CPU * maximum only when it has been affected by the new value of * the updated boost group */ cur_boost_max = bg->boost_max; old_boost = bg->group[idx].boost; /* Update the boost value of this boost group */ bg->group[idx].boost = boost; /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; continue; } /* Check if this update has decreased current max */ if (cur_boost_max == old_boost && old_boost > boost) schedtune_cpu_update(cpu); } return 0; } static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { struct boost_groups *bg; int tasks; bg = &per_cpu(cpu_boost_groups, cpu); /* Update boosted tasks count while avoiding to make it negative */ if (task_count < 0 && bg->group[idx].tasks <= -task_count) bg->group[idx].tasks = 0; else bg->group[idx].tasks += task_count; /* Boost group activation or deactivation on that RQ */ tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); } /* * NOTE: This function must be called while holding the lock on the CPU RQ */ void schedtune_enqueue_task(struct task_struct *p, int cpu) { struct schedtune *st; int idx; /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. * Thus we avoid any further update, since we do not want to change * CPU boosting while the task is exiting. */ if (p->flags & PF_EXITING) return; /* Get task boost group */ rcu_read_lock(); st = task_schedtune(p); idx = st->idx; rcu_read_unlock(); schedtune_tasks_update(p, cpu, idx, 1); } /* * NOTE: This function must be called while holding the lock on the CPU RQ */ void schedtune_dequeue_task(struct task_struct *p, int cpu) { struct schedtune *st; int idx; /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. * Thus we avoid any further update, since we do not want to change * CPU boosting while the task is exiting. * The last dequeue will be done by cgroup exit() callback. */ if (p->flags & PF_EXITING) return; /* Get task boost group */ rcu_read_lock(); st = task_schedtune(p); idx = st->idx; rcu_read_unlock(); schedtune_tasks_update(p, cpu, idx, -1); } int schedtune_cpu_boost(int cpu) { struct boost_groups *bg; bg = &per_cpu(cpu_boost_groups, cpu); return bg->boost_max; } int schedtune_task_boost(struct task_struct *p) { struct schedtune *st; int task_boost; /* Get task boost value */ rcu_read_lock(); st = task_schedtune(p); task_boost = st->boost; rcu_read_unlock(); return task_boost; } static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct schedtune *st = css_st(css); return st->boost; } static int boost_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 boost) { struct schedtune *st = css_st(css); if (boost < 0 || boost > 100) return -EINVAL; st->boost = boost; if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); trace_sched_tune_config(st->boost); return 0; } static struct cftype files[] = { { .name = "boost", .read_u64 = boost_read, .write_u64 = boost_write, }, { } /* terminate */ }; static int schedtune_boostgroup_init(struct schedtune *st) { struct boost_groups *bg; int cpu; /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; /* Initialize the per CPU boost groups */ for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); bg->group[st->idx].boost = 0; bg->group[st->idx].tasks = 0; } return 0; } static int schedtune_init(void) { struct boost_groups *bg; int cpu; /* Initialize the per CPU boost groups */ for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); } pr_info(" schedtune configured to support %d boost groups\n", BOOSTGROUPS_COUNT); return 0; } static struct cgroup_subsys_state * schedtune_css_alloc(struct cgroup_subsys_state *parent_css) { struct schedtune *st; int idx; if (!parent_css) { schedtune_init(); return &root_schedtune.css; } /* Allow only single level hierachies */ if (parent_css != &root_schedtune.css) { pr_err("Nested SchedTune boosting groups not allowed\n"); return ERR_PTR(-ENOMEM); } /* Allow only a limited number of boosting groups */ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) if (!allocated_group[idx]) break; if (idx == BOOSTGROUPS_COUNT) { pr_err("Trying to create more than %d SchedTune boosting groups\n", BOOSTGROUPS_COUNT); return ERR_PTR(-ENOSPC); } st = kzalloc(sizeof(*st), GFP_KERNEL); if (!st) goto out; /* Initialize per CPUs boost group support */ st->idx = idx; if (schedtune_boostgroup_init(st)) goto release; return &st->css; release: kfree(st); out: return ERR_PTR(-ENOMEM); } static void schedtune_boostgroup_release(struct schedtune *st) { /* Reset this boost group */ schedtune_boostgroup_update(st->idx, 0); /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; } static void schedtune_css_free(struct cgroup_subsys_state *css) { struct schedtune *st = css_st(css); schedtune_boostgroup_release(st); kfree(st); } struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, .legacy_cftypes = files, .early_init = 1, }; #else /* CONFIG_CGROUP_SCHEDTUNE */ int schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task) { /* Optimal (O) region */ if (nrg_delta < 0 && cap_delta > 0) return INT_MAX; /* Suboptimal (S) region */ if (nrg_delta > 0 && cap_delta < 0) return -INT_MAX; return __schedtune_accept_deltas(nrg_delta, cap_delta, perf_boost_idx, perf_constrain_idx); } #endif /* CONFIG_CGROUP_SCHEDTUNE */ int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; /* Performance Boost (B) region threshold params */ perf_boost_idx = sysctl_sched_cfs_boost; perf_boost_idx /= 10; /* Performance Constraint (C) region threshold params */ perf_constrain_idx = 100 - sysctl_sched_cfs_boost; perf_constrain_idx /= 10; return 0; } /* * System energy normalization * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], * corresponding to the specified energy variation. */ int schedtune_normalize_energy(int energy_diff) { u32 normalized_nrg; int max_delta; #ifdef CONFIG_SCHED_DEBUG /* Check for boundaries */ max_delta = schedtune_target_nrg.max_power; max_delta -= schedtune_target_nrg.min_power; WARN_ON(abs(energy_diff) >= max_delta); #endif /* Do scaling using positive numbers to increase the range */ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; /* Scale by energy magnitude */ normalized_nrg <<= SCHED_LOAD_SHIFT; /* Normalize on max energy for target platform */ normalized_nrg = reciprocal_divide( normalized_nrg, schedtune_target_nrg.rdiv); return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; } #ifdef CONFIG_SCHED_DEBUG static void schedtune_test_nrg(unsigned long delta_pwr) { unsigned long test_delta_pwr; unsigned long test_norm_pwr; int idx; /* * Check normalization constants using some constant system * energy values */ pr_info("schedtune: verify normalization constants...\n"); for (idx = 0; idx < 6; ++idx) { test_delta_pwr = delta_pwr >> idx; /* Normalize on max energy for target platform */ test_norm_pwr = reciprocal_divide( test_delta_pwr << SCHED_LOAD_SHIFT, schedtune_target_nrg.rdiv); pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", idx, test_delta_pwr, test_norm_pwr); } } #else #define schedtune_test_nrg(delta_pwr) #endif /* * Compute the min/max power consumption of a cluster and all its CPUs */ static void schedtune_add_cluster_nrg( struct sched_domain *sd, struct sched_group *sg, struct target_nrg *ste) { struct sched_domain *sd2; struct sched_group *sg2; struct cpumask *cluster_cpus; char str[32]; unsigned long min_pwr; unsigned long max_pwr; int cpu; /* Get Cluster energy using EM data for the first CPU */ cluster_cpus = sched_group_cpus(sg); snprintf(str, 32, "CLUSTER[%*pbl]", cpumask_pr_args(cluster_cpus)); min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", str, min_pwr, max_pwr); /* * Keep track of this cluster's energy in the computation of the * overall system energy */ ste->min_power += min_pwr; ste->max_power += max_pwr; /* Get CPU energy using EM data for each CPU in the group */ for_each_cpu(cpu, cluster_cpus) { /* Get a SD view for the specific CPU */ for_each_domain(cpu, sd2) { /* Get the CPU group */ sg2 = sd2->groups; min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; ste->min_power += min_pwr; ste->max_power += max_pwr; snprintf(str, 32, "CPU[%d]", cpu); pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", str, min_pwr, max_pwr); /* * Assume we have EM data only at the CPU and * the upper CLUSTER level */ BUG_ON(!cpumask_equal( sched_group_cpus(sg), sched_group_cpus(sd2->parent->groups) )); break; } } } /* * Initialize the constants required to compute normalized energy. * The values of these constants depends on the EM data for the specific * target system and topology. * Thus, this function is expected to be called by the code * that bind the EM to the topology information. */ static int schedtune_init_late(void) { struct target_nrg *ste = &schedtune_target_nrg; unsigned long delta_pwr = 0; struct sched_domain *sd; struct sched_group *sg; pr_info("schedtune: init normalization constants...\n"); ste->max_power = 0; ste->min_power = 0; rcu_read_lock(); /* * When EAS is in use, we always have a pointer to the highest SD * which provides EM data. */ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); if (!sd) { pr_info("schedtune: no energy model data\n"); goto nodata; } sg = sd->groups; do { schedtune_add_cluster_nrg(sd, sg, ste); } while (sg = sg->next, sg != sd->groups); rcu_read_unlock(); pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", "SYSTEM", ste->min_power, ste->max_power); /* Compute normalization constants */ delta_pwr = ste->max_power - ste->min_power; ste->rdiv = reciprocal_value(delta_pwr); pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); schedtune_test_nrg(delta_pwr); return 0; nodata: rcu_read_unlock(); return -EINVAL; } late_initcall(schedtune_init_late);