kernel/smpboot.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567

/*
 * Common SMP CPU bringup/teardown functions
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/smpboot.h>
#include <linux/kmemleak.h>

#include "smpboot.h"

#ifdef CONFIG_SMP

#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
/*
 * For the hotplug case we keep the task structs around and reuse
 * them.
 */
static DEFINE_PER_CPU(struct task_struct *, idle_threads);

struct task_struct *idle_thread_get(unsigned int cpu)
{
	struct task_struct *tsk = per_cpu(idle_threads, cpu);

	if (!tsk)
		return ERR_PTR(-ENOMEM);
	init_idle(tsk, cpu);
	return tsk;
}

void __init idle_thread_set_boot_cpu(void)
{
	per_cpu(idle_threads, smp_processor_id()) = current;
}

/**
 * idle_init - Initialize the idle thread for a cpu
 * @cpu:	The cpu for which the idle thread should be initialized
 *
 * Creates the thread if it does not exist.
 */
static inline void idle_init(unsigned int cpu)
{
	struct task_struct *tsk = per_cpu(idle_threads, cpu);

	if (!tsk) {
		tsk = fork_idle(cpu);
		if (IS_ERR(tsk))
			pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
		else
			per_cpu(idle_threads, cpu) = tsk;
	}
}

/**
 * idle_threads_init - Initialize idle threads for all cpus
 */
void __init idle_threads_init(void)
{
	unsigned int cpu, boot_cpu;

	boot_cpu = smp_processor_id();

	for_each_possible_cpu(cpu) {
		if (cpu != boot_cpu)
			idle_init(cpu);
	}
}
#endif

#endif /* #ifdef CONFIG_SMP */

static LIST_HEAD(hotplug_threads);
static DEFINE_MUTEX(smpboot_threads_lock);

struct smpboot_thread_data {
	unsigned int			cpu;
	unsigned int			status;
	struct smp_hotplug_thread	*ht;
};

enum {
	HP_THREAD_NONE = 0,
	HP_THREAD_ACTIVE,
	HP_THREAD_PARKED,
};

/**
 * smpboot_thread_fn - percpu hotplug thread loop function
 * @data:	thread data pointer
 *
 * Checks for thread stop and park conditions. Calls the necessary
 * setup, cleanup, park and unpark functions for the registered
 * thread.
 *
 * Returns 1 when the thread should exit, 0 otherwise.
 */
static int smpboot_thread_fn(void *data)
{
	struct smpboot_thread_data *td = data;
	struct smp_hotplug_thread *ht = td->ht;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		preempt_disable();
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			/* cleanup must mirror setup */
			if (ht->cleanup && td->status != HP_THREAD_NONE)
				ht->cleanup(td->cpu, cpu_online(td->cpu));
			kfree(td);
			return 0;
		}

		if (kthread_should_park()) {
			/*
			 * Serialize against wakeup. If we take the lock first,
			 * wakeup is skipped. If we run later, we observe,
			 * TASK_RUNNING update from wakeup path, before moving
			 * forward. This helps avoid the race, where wakeup
			 * observes TASK_INTERRUPTIBLE, and also observes
			 * the TASK_PARKED in kthread_parkme() before updating
			 * task state to TASK_RUNNING. In this case, kthread
			 * gets parked in TASK_RUNNING state. This results
			 * in panic later on in kthread_unpark(), as it sees
			 * KTHREAD_IS_PARKED flag set but fails to rebind the
			 * kthread, due to it being not in TASK_PARKED state.
			 *
			 * Control thread                      Hotplug Thread
			 *
			 * kthread_park()
			 *   set KTHREAD_SHOULD_PARK
			 *                                smpboot_thread_fn()
			 *                                  set_current_state(
			 *                                  TASK_INTERRUPTIBLE);
			 *                                  kthread_parkme()
			 *
			 *   wake_up_process()
			 *
			 * raw_spin_lock_irqsave(&p->pi_lock, flags);
			 * if (!(p->state & state))
			 *            goto out;
			 *
			 *                                  __set_current_state(
			 *                                  TASK_PARKED);
			 *
			 * if (p->on_rq && ttwu_remote(p, wake_flags))
			 *   ttwu_remote()
			 *     p->state = TASK_RUNNING;
			 *                                   schedule();
			 */
			raw_spin_lock(&current->pi_lock);
			__set_current_state(TASK_RUNNING);
			raw_spin_unlock(&current->pi_lock);
			preempt_enable();
			if (ht->park && td->status == HP_THREAD_ACTIVE) {
				BUG_ON(td->cpu != smp_processor_id());
				ht->park(td->cpu);
				td->status = HP_THREAD_PARKED;
			}
			kthread_parkme();
			/* We might have been woken for stop */
			continue;
		}

		BUG_ON(td->cpu != smp_processor_id());

		/* Check for state change setup */
		switch (td->status) {
		case HP_THREAD_NONE:
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			if (ht->setup)
				ht->setup(td->cpu);
			td->status = HP_THREAD_ACTIVE;
			continue;

		case HP_THREAD_PARKED:
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			if (ht->unpark)
				ht->unpark(td->cpu);
			td->status = HP_THREAD_ACTIVE;
			continue;
		}

		if (!ht->thread_should_run(td->cpu)) {
			preempt_enable_no_resched();
			schedule();
		} else {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			ht->thread_fn(td->cpu);
		}
	}
}

static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
	struct smpboot_thread_data *td;

	if (tsk)
		return 0;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
	if (!td)
		return -ENOMEM;

	kmemleak_not_leak(td);
	td->cpu = cpu;
	td->ht = ht;

	tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
				    ht->thread_comm);
	if (IS_ERR(tsk)) {
		kfree(td);
		return PTR_ERR(tsk);
	}
	get_task_struct(tsk);
	*per_cpu_ptr(ht->store, cpu) = tsk;
	if (ht->create) {
		/*
		 * Make sure that the task has actually scheduled out
		 * into park position, before calling the create
		 * callback. At least the migration thread callback
		 * requires that the task is off the runqueue.
		 */
		if (!wait_task_inactive(tsk, TASK_PARKED))
			WARN_ON(1);
		else
			ht->create(cpu);
	}
	return 0;
}

int smpboot_create_threads(unsigned int cpu)
{
	struct smp_hotplug_thread *cur;
	int ret = 0;

	mutex_lock(&smpboot_threads_lock);
	list_for_each_entry(cur, &hotplug_threads, list) {
		ret = __smpboot_create_thread(cur, cpu);
		if (ret)
			break;
	}
	mutex_unlock(&smpboot_threads_lock);
	return ret;
}

static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);

	if (!ht->selfparking)
		kthread_unpark(tsk);
}

void smpboot_unpark_threads(unsigned int cpu)
{
	struct smp_hotplug_thread *cur;

	mutex_lock(&smpboot_threads_lock);
	list_for_each_entry(cur, &hotplug_threads, list)
		if (cpumask_test_cpu(cpu, cur->cpumask))
			smpboot_unpark_thread(cur, cpu);
	mutex_unlock(&smpboot_threads_lock);
}

static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);

	if (tsk && !ht->selfparking)
		kthread_park(tsk);
}

void smpboot_park_threads(unsigned int cpu)
{
	struct smp_hotplug_thread *cur;

	mutex_lock(&smpboot_threads_lock);
	list_for_each_entry_reverse(cur, &hotplug_threads, list)
		smpboot_park_thread(cur, cpu);
	mutex_unlock(&smpboot_threads_lock);
}

static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
	unsigned int cpu;

	/* We need to destroy also the parked threads of offline cpus */
	for_each_possible_cpu(cpu) {
		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);

		if (tsk) {
			kthread_stop(tsk);
			put_task_struct(tsk);
			*per_cpu_ptr(ht->store, cpu) = NULL;
		}
	}
}

/**
 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
 * 					    to hotplug
 * @plug_thread:	Hotplug thread descriptor
 * @cpumask:		The cpumask where threads run
 *
 * Creates and starts the threads on all online cpus.
 */
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
					   const struct cpumask *cpumask)
{
	unsigned int cpu;
	int ret = 0;

	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(plug_thread->cpumask, cpumask);

	get_online_cpus();
	mutex_lock(&smpboot_threads_lock);
	for_each_online_cpu(cpu) {
		ret = __smpboot_create_thread(plug_thread, cpu);
		if (ret) {
			smpboot_destroy_threads(plug_thread);
			free_cpumask_var(plug_thread->cpumask);
			goto out;
		}
		if (cpumask_test_cpu(cpu, cpumask))
			smpboot_unpark_thread(plug_thread, cpu);
	}
	list_add(&plug_thread->list, &hotplug_threads);
out:
	mutex_unlock(&smpboot_threads_lock);
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);

/**
 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
 * @plug_thread:	Hotplug thread descriptor
 *
 * Stops all threads on all possible cpus.
 */
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
	get_online_cpus();
	mutex_lock(&smpboot_threads_lock);
	list_del(&plug_thread->list);
	smpboot_destroy_threads(plug_thread);
	mutex_unlock(&smpboot_threads_lock);
	put_online_cpus();
	free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);

/**
 * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
 * @plug_thread:	Hotplug thread descriptor
 * @new:		Revised mask to use
 *
 * The cpumask field in the smp_hotplug_thread must not be updated directly
 * by the client, but only by calling this function.
 * This function can only be called on a registered smp_hotplug_thread.
 */
int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
					 const struct cpumask *new)
{
	struct cpumask *old = plug_thread->cpumask;
	cpumask_var_t tmp;
	unsigned int cpu;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
		return -ENOMEM;

	get_online_cpus();
	mutex_lock(&smpboot_threads_lock);

	/* Park threads that were exclusively enabled on the old mask. */
	cpumask_andnot(tmp, old, new);
	for_each_cpu_and(cpu, tmp, cpu_online_mask)
		smpboot_park_thread(plug_thread, cpu);

	/* Unpark threads that are exclusively enabled on the new mask. */
	cpumask_andnot(tmp, new, old);
	for_each_cpu_and(cpu, tmp, cpu_online_mask)
		smpboot_unpark_thread(plug_thread, cpu);

	cpumask_copy(old, new);

	mutex_unlock(&smpboot_threads_lock);
	put_online_cpus();

	free_cpumask_var(tmp);

	return 0;
}
EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);

static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);

/*
 * Called to poll specified CPU's state, for example, when waiting for
 * a CPU to come online.
 */
int cpu_report_state(int cpu)
{
	return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
}

/*
 * If CPU has died properly, set its state to CPU_UP_PREPARE and
 * return success.  Otherwise, return -EBUSY if the CPU died after
 * cpu_wait_death() timed out.  And yet otherwise again, return -EAGAIN
 * if cpu_wait_death() timed out and the CPU still hasn't gotten around
 * to dying.  In the latter two cases, the CPU might not be set up
 * properly, but it is up to the arch-specific code to decide.
 * Finally, -EIO indicates an unanticipated problem.
 *
 * Note that it is permissible to omit this call entirely, as is
 * done in architectures that do no CPU-hotplug error checking.
 */
int cpu_check_up_prepare(int cpu)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
		return 0;
	}

	switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {

	case CPU_POST_DEAD:

		/* The CPU died properly, so just start it up again. */
		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
		return 0;

	case CPU_DEAD_FROZEN:

		/*
		 * Timeout during CPU death, so let caller know.
		 * The outgoing CPU completed its processing, but after
		 * cpu_wait_death() timed out and reported the error. The
		 * caller is free to proceed, in which case the state
		 * will be reset properly by cpu_set_state_online().
		 * Proceeding despite this -EBUSY return makes sense
		 * for systems where the outgoing CPUs take themselves
		 * offline, with no post-death manipulation required from
		 * a surviving CPU.
		 */
		return -EBUSY;

	case CPU_BROKEN:

		/*
		 * The most likely reason we got here is that there was
		 * a timeout during CPU death, and the outgoing CPU never
		 * did complete its processing.  This could happen on
		 * a virtualized system if the outgoing VCPU gets preempted
		 * for more than five seconds, and the user attempts to
		 * immediately online that same CPU.  Trying again later
		 * might return -EBUSY above, hence -EAGAIN.
		 */
		return -EAGAIN;

	default:

		/* Should not happen.  Famous last words. */
		return -EIO;
	}
}

/*
 * Mark the specified CPU online.
 *
 * Note that it is permissible to omit this call entirely, as is
 * done in architectures that do no CPU-hotplug error checking.
 */
void cpu_set_state_online(int cpu)
{
	(void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Wait for the specified CPU to exit the idle loop and die.
 */
bool cpu_wait_death(unsigned int cpu, int seconds)
{
	int jf_left = seconds * HZ;
	int oldstate;
	bool ret = true;
	int sleep_jf = 1;

	might_sleep();

	/* The outgoing CPU will normally get done quite quickly. */
	if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
		goto update_state;
	udelay(5);

	/* But if the outgoing CPU dawdles, wait increasingly long times. */
	while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
		schedule_timeout_uninterruptible(sleep_jf);
		jf_left -= sleep_jf;
		if (jf_left <= 0)
			break;
		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
	}
update_state:
	oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
	if (oldstate == CPU_DEAD) {
		/* Outgoing CPU died normally, update state. */
		smp_mb(); /* atomic_read() before update. */
		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
	} else {
		/* Outgoing CPU still hasn't died, set state accordingly. */
		if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
				   oldstate, CPU_BROKEN) != oldstate)
			goto update_state;
		ret = false;
	}
	return ret;
}

/*
 * Called by the outgoing CPU to report its successful death.  Return
 * false if this report follows the surviving CPU's timing out.
 *
 * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
 * timed out.  This approach allows architectures to omit calls to
 * cpu_check_up_prepare() and cpu_set_state_online() without defeating
 * the next cpu_wait_death()'s polling loop.
 */
bool cpu_report_death(void)
{
	int oldstate;
	int newstate;
	int cpu = smp_processor_id();

	do {
		oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
		if (oldstate != CPU_BROKEN)
			newstate = CPU_DEAD;
		else
			newstate = CPU_DEAD_FROZEN;
	} while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
				oldstate, newstate) != oldstate);
	return newstate == CPU_DEAD;
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */