author		Syed Rameez Mustafa <rameezmustafa@codeaurora.org>	2016-05-31 16:40:45 -0700
committer	Syed Rameez Mustafa <rameezmustafa@codeaurora.org>	2016-10-17 12:43:55 -0700
commit		7bd09f24415cb4809973ed4f536c717b91dc0e18 (patch)
tree		96e02cb8cc121a5836103ba511ebcc3f7fc9fd57 /kernel
parent		7e1a4f15b2c38ea0d0207a6fc95b721c09d6f994 (diff)
sched: Add the mechanics of top task tracking for frequency guidance
The previous patches in this rewrite of scheduler guided frequency selection reintroduce the part-picture problem that we addressed in our initial implementation: when tasks migrate across CPUs within a cluster, we end up losing the complete picture of the sequential nature of the workload.

This patch aims to solve that problem slightly differently. We track the top task on every CPU within a window, where the top task is defined as the task that runs the most in a given window. This enhances our ability to detect the sequential nature of workloads: a single migrating task executing for an entire window will cause 100% load to be reported for frequency guidance, instead of the maximum footprint left on any individual CPU in the task's trail. There are cases that this new approach does not address, namely cases where the sum of two or more tasks accurately reflects the true sequential nature of the workload. Future optimizations might aim to tackle that problem.

To track top tasks, we first realize that there is no strict need to maintain the task struct itself as long as we know the load exerted by the top task. We also realize that to maintain top tasks on every CPU we have to track the execution of every single task that runs during the window. The load associated with a task needs to be migrated when the task migrates from one CPU to another, and when the top task migrates away we need to locate the second top task, and so on.

Given the above realizations, we use hashmaps to track top task load both for the current and the previous window. Each hashmap is implemented as a fixed-size array, where the number of buckets dictates the load granularity of each bucket: every bucket covers sched_ravg_window / NUM_LOAD_INDICES worth of execution time. The key for a task is therefore task_execution_time_in_a_window / bucket_size, and the value stored in each bucket is a refcount of all the tasks that executed long enough to fall into that bucket.

This approach has a few benefits. Firstly, any top task stats update now takes O(1) time, and while task migration is also cheap, it can still involve scanning through up to the size of the array to find the next top task; further patches will aim to optimize this behavior. Secondly, and more importantly, not having to store the task struct itself saves a lot of memory: 1) there is no need to retrieve task structs later, causing cache misses, and 2) we don't have to unnecessarily hold up task memory for up to 2 full windows by calling get_task_struct() after a task exits.

Change-Id: I004dba474f41590db7d3f40d9deafe86e71359ac
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
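To make the bucketing concrete, below is a minimal userspace model of the scheme described above. This is a sketch for illustration only, not the kernel code: the real helpers (load_to_index(), update_top_tasks(), find_next_top_index()) are in the hmp.c hunks further down, and the add_task()/remove_task() helpers and the main() walk-through here are hypothetical.

	#include <stdio.h>
	#include <stdint.h>

	#define WINDOW_NS	10000000U		/* default sched_ravg_window: 10ms */
	#define NUM_INDICES	1000			/* NUM_LOAD_INDICES in the patch */
	#define GRANULE		(WINDOW_NS / NUM_INDICES)	/* 10us per bucket */

	static uint8_t buckets[NUM_INDICES];	/* per-bucket task refcounts (u8 in the patch) */
	static int top;				/* highest occupied index, i.e. the top task */

	static int load_to_index(uint32_t load_ns)
	{
		if (load_ns < GRANULE)
			return 0;
		if (load_ns >= WINDOW_NS)
			return NUM_INDICES - 1;
		return load_ns / GRANULE;
	}

	/* O(1): account a task's execution time within the window */
	static void add_task(uint32_t load_ns)
	{
		int idx = load_to_index(load_ns);

		buckets[idx]++;
		if (idx > top)
			top = idx;
	}

	/* Migration away from this CPU: drop the refcount, find the next top */
	static void remove_task(uint32_t load_ns)
	{
		buckets[load_to_index(load_ns)]--;
		while (top > 0 && !buckets[top])	/* up to O(NUM_INDICES), as noted */
			top--;
	}

	int main(void)
	{
		add_task(3700000);	/* 3.7ms of execution -> bucket 370 */
		add_task(9000000);	/* 9.0ms of execution -> bucket 900, the new top */
		remove_task(9000000);	/* the top task migrates away */
		printf("top index %d, reported load ~%u ns\n", top, (top + 1) * GRANULE);
		return 0;
	}

With the default 10ms window and 1000 indices each bucket covers 10us of execution time, so the 3.7ms task above lands in bucket 370 and, once the 9ms top task migrates away, the CPU reports (370 + 1) * 10us of top task load.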
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/core.c	13
-rw-r--r--	kernel/sched/debug.c	1
-rw-r--r--	kernel/sched/hmp.c	261
-rw-r--r--	kernel/sched/sched.h	10
4 files changed, 242 insertions, 43 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90d7ba39e4c2..5c616517d4d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8015,9 +8015,20 @@ void __init sched_init(void)
rq->old_estimated_time = 0;
rq->old_busy_time_group = 0;
rq->hmp_stats.pred_demands_sum = 0;
- for (j = 0; j < NUM_SUBTRACTION_WINDOWS; j++)
+ rq->curr_table = 0;
+ rq->prev_top = 0;
+ rq->curr_top = 0;
+
+ for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
memset(&rq->load_subs[j], 0,
sizeof(struct load_subtractions));
+
+ rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
+ sizeof(u8), GFP_NOWAIT);
+
+ /* No other choice */
+ BUG_ON(!rq->top_tasks[j]);
+ }
#endif
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b6dc131f36a6..c8c4272c61d8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -418,6 +418,7 @@ static void sched_debug_header(struct seq_file *m)
P(min_capacity);
P(max_capacity);
P(sched_ravg_window);
+ P(sched_load_granule);
#endif
#undef PN
#undef P
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 35f4ea1761e2..8675ebeebf6a 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -825,15 +825,15 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
-/* Window size (in ns) */
-__read_mostly unsigned int sched_ravg_window = 10000000;
-
/* Min window size (in ns) = 10ms */
#define MIN_SCHED_RAVG_WINDOW 10000000
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
@@ -853,6 +853,17 @@ static DEFINE_RWLOCK(related_thread_group_lock);
list_for_each_entry(grp, &related_thread_groups, list)
/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+ MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
+
+/*
* Demand aggregation for frequency purpose:
*
* 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
@@ -2172,6 +2183,113 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
p->ravg.pred_demand = new;
}
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 __maybe_unused top_task_load(struct rq *rq)
+{
+ int index = rq->prev_top;
+
+ if (!index) {
+ if (!rq->prev_runnable_sum)
+ return 0;
+ else
+ return sched_load_granule;
+ } else if (index == NUM_LOAD_INDICES - 1) {
+ return sched_ravg_window;
+ } else {
+ return (index + 1) * sched_load_granule;
+ }
+}
+
+static int load_to_index(u32 load)
+{
+ if (load < sched_load_granule)
+ return 0;
+ else if (load >= sched_ravg_window)
+ return NUM_LOAD_INDICES - 1;
+ else
+ return load / sched_load_granule;
+}
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+ u32 old_curr_window, int new_window, bool full_window)
+{
+ u8 *curr_table = rq->top_tasks[rq->curr_table];
+ u8 *prev_table = rq->top_tasks[1 - rq->curr_table];
+ int old_index, new_index, update_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ bool zero_index_update;
+
+ if (old_curr_window == curr_window && !new_window)
+ return;
+
+ old_index = load_to_index(old_curr_window);
+ new_index = load_to_index(curr_window);
+
+ if (!new_window) {
+ zero_index_update = !old_curr_window && curr_window;
+ if (old_index != new_index || zero_index_update) {
+ if (old_curr_window)
+ curr_table[old_index] -= 1;
+ if (curr_window)
+ curr_table[new_index] += 1;
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+
+ return;
+ }
+
+ /*
+ * The window has rolled over for this task. By the time we get
+ * here, curr/prev swaps would have already occurred. So we need
+ * to use prev_window for the new index.
+ */
+ update_index = load_to_index(prev_window);
+
+ if (full_window) {
+ /*
+ * Two cases here. Either 'p' ran for the entire window or
+ * it didn't run at all. In either case there is no entry
+ * in the prev table. If 'p' ran the entire window, we just
+ * need to create a new entry in the prev table. In this case
+ * update_index will correspond to sched_ravg_window
+ * so we can unconditionally update the top index.
+ */
+ if (prev_window) {
+ prev_table[update_index] += 1;
+ rq->prev_top = update_index;
+ }
+ } else {
+ zero_index_update = !old_curr_window && prev_window;
+ if (old_index != update_index || zero_index_update) {
+ if (old_curr_window)
+ prev_table[old_index] -= 1;
+
+ prev_table[update_index] += 1;
+
+ if (update_index > rq->prev_top)
+ rq->prev_top = update_index;
+ }
+ }
+
+ if (curr_window) {
+ curr_table[new_index] += 1;
+
+ if (new_index > rq->curr_top)
+ rq->curr_top = new_index;
+ }
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+ memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
static u32 empty_windows[NR_CPUS];
static void rollover_task_window(struct task_struct *p, bool full_window)
@@ -2219,6 +2337,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
bool new_task;
struct related_thread_group *grp;
int cpu = rq->cpu;
+ u32 old_curr_window;
new_window = mark_start < window_start;
if (new_window) {
@@ -2278,51 +2397,40 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks.
*/
- if (new_window && !is_idle_task(p) && !exiting_task(p))
- rollover_task_window(p, full_window);
+ if (!is_idle_task(p) && !exiting_task(p)) {
+ old_curr_window = p->ravg.curr_window;
+ if (new_window)
+ rollover_task_window(p, full_window);
+ }
if (flip_counters) {
u64 curr_sum = *curr_runnable_sum;
u64 nt_curr_sum = *nt_curr_runnable_sum;
+ u8 curr_table = rq->curr_table;
+ u8 prev_table = 1 - curr_table;
+ int curr_top = rq->curr_top;
- if (prev_sum_reset)
+ clear_top_tasks_table(rq->top_tasks[prev_table]);
+
+ if (prev_sum_reset) {
curr_sum = nt_curr_sum = 0;
+ curr_top = 0;
+ clear_top_tasks_table(rq->top_tasks[curr_table]);
+ }
*prev_runnable_sum = curr_sum;
*nt_prev_runnable_sum = nt_curr_sum;
*curr_runnable_sum = 0;
*nt_curr_runnable_sum = 0;
+ rq->curr_table = prev_table;
+ rq->prev_top = curr_top;
+ rq->curr_top = 0;
}
- if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
- /*
- * account_busy_for_cpu_time() = 0, so no update to the
- * task's current window needs to be made. This could be
- * for example
- *
- * - a wakeup event on a task within the current
- * window (!new_window below, no action required),
- * - switching to a new task from idle (PICK_NEXT_TASK)
- * in a new window where irqtime is 0 and we aren't
- * waiting on IO
- */
-
- if (!new_window)
- return;
-
- /*
- * A new window has started. The RQ demand must be rolled
- * over if p is the current task.
- */
- if (p_is_curr_task) {
- /* p is idle task */
- BUG_ON(p != rq->idle);
- }
-
- return;
- }
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+ goto done;
if (!new_window) {
/*
@@ -2347,7 +2455,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
p->ravg.curr_window_cpu[cpu] += delta;
}
- return;
+ goto done;
}
if (!p_is_curr_task) {
@@ -2402,7 +2510,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
p->ravg.curr_window_cpu[cpu] = delta;
}
- return;
+ goto done;
}
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
@@ -2462,7 +2570,7 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
p->ravg.curr_window_cpu[cpu] = delta;
}
- return;
+ goto done;
}
if (irqtime) {
@@ -2507,7 +2615,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
return;
}
- BUG();
+done:
+ if (!is_idle_task(p) && !exiting_task(p))
+ update_top_tasks(p, rq, old_curr_window,
+ new_window, full_window);
}
static inline u32 predict_and_update_buckets(struct rq *rq,
@@ -3028,6 +3139,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
+ sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
}
enable_window_stats();
@@ -3039,9 +3151,15 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
rq->window_start = window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
- for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++)
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
memset(&rq->load_subs[i], 0,
sizeof(struct load_subtractions));
+ clear_top_tasks_table(rq->top_tasks[i]);
+ }
+
+ rq->curr_table = 0;
+ rq->curr_top = 0;
+ rq->prev_top = 0;
reset_cpu_hmp_stats(cpu, 1);
}
@@ -3088,7 +3206,7 @@ static inline void account_load_subtractions(struct rq *rq)
struct load_subtractions *ls = rq->load_subs;
int i;
- for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) {
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
if (ls[i].window_start == ws) {
rq->curr_runnable_sum -= ls[i].subs;
rq->nt_curr_runnable_sum -= ls[i].new_subs;
@@ -3357,7 +3475,7 @@ static bool get_subtraction_index(struct rq *rq, u64 ws)
u64 oldest = ULLONG_MAX;
int oldest_index = 0;
- for (i = 0; i < NUM_SUBTRACTION_WINDOWS; i++) {
+ for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
u64 entry_ws = rq->load_subs[i].window_start;
if (ws == entry_ws)
@@ -3454,6 +3572,68 @@ static inline void inter_cluster_migration_fixup
BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
}
+static int find_next_top_index(u8 *tasks, int end)
+{
+ int i;
+
+ if (end <= 1)
+ return 0;
+
+ for (i = end - 1; i >= 0; i--) {
+ if (tasks[i])
+ return i;
+ }
+
+ return 0;
+}
+
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+ int index;
+ int top_index;
+ u32 curr_window = p->ravg.curr_window;
+ u32 prev_window = p->ravg.prev_window;
+ u8 src = src_rq->curr_table;
+ u8 dst = dst_rq->curr_table;
+ u8 *src_table;
+ u8 *dst_table;
+
+ if (curr_window) {
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(curr_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (index > dst_rq->curr_top)
+ dst_rq->curr_top = index;
+
+ top_index = src_rq->curr_top;
+ if (index == top_index && !src_table[index])
+ src_rq->curr_top =
+ find_next_top_index(src_table, top_index);
+ }
+
+ if (prev_window) {
+ src = 1 - src;
+ dst = 1 - dst;
+ src_table = src_rq->top_tasks[src];
+ dst_table = dst_rq->top_tasks[dst];
+ index = load_to_index(prev_window);
+ src_table[index] -= 1;
+ dst_table[index] += 1;
+
+ if (index > dst_rq->prev_top)
+ dst_rq->prev_top = index;
+
+ top_index = src_rq->prev_top;
+ if (index == top_index && !src_table[index])
+ src_rq->prev_top =
+ find_next_top_index(src_table, top_index);
+ }
+}
+
void fixup_busy_time(struct task_struct *p, int new_cpu)
{
struct rq *src_rq = task_rq(p);
@@ -3544,6 +3724,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
task_cpu(p), new_task);
}
+ migrate_top_tasks(p, src_rq, dest_rq);
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c107712643dc..5cbf374696ee 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -351,7 +351,8 @@ struct cfs_bandwidth { };
#ifdef CONFIG_SCHED_HMP
-#define NUM_SUBTRACTION_WINDOWS 2
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
struct hmp_sched_stats {
int nr_big_tasks;
@@ -751,7 +752,11 @@ struct rq {
u64 prev_runnable_sum;
u64 nt_curr_runnable_sum;
u64 nt_prev_runnable_sum;
- struct load_subtractions load_subs[NUM_SUBTRACTION_WINDOWS];
+ struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
+ u8 *top_tasks[NUM_TRACKED_WINDOWS];
+ u8 curr_table;
+ int prev_top;
+ int curr_top;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1066,6 +1071,7 @@ extern unsigned int __read_mostly sched_spill_load;
extern unsigned int __read_mostly sched_upmigrate;
extern unsigned int __read_mostly sched_downmigrate;
extern unsigned int __read_mostly sysctl_sched_spill_nr_run;
+extern unsigned int __read_mostly sched_load_granule;
extern void init_new_task_load(struct task_struct *p, bool idle_task);
extern u64 sched_ktime_clock(void);
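
For reference, a worked example of the reporting rules in top_task_load() under the defaults introduced by this patch (assuming sched_ravg_window stays at its 10ms boot value, so sched_load_granule = 10000000 / 1000 = 10000 ns):

	prev_top == 0, prev_runnable_sum == 0   ->  0 ns (CPU idle in the window)
	prev_top == 0, prev_runnable_sum != 0   ->  10000 ns (one granule)
	prev_top == 999 (NUM_LOAD_INDICES - 1)  ->  10000000 ns (a full window)
	prev_top == 370 (any other index)       ->  (370 + 1) * 10000 = 3710000 ns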